From d8fba1b7a18f1408a3e1e13dfff424ad4335fd8f Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:35:54 -0500 Subject: [PATCH] Add support for producing dashboard outputs (#426) Add support for producing & consuming telemetry outputs. - Adds telemetry monitor to check for updates and produce events for the dashboard - Updates controller to conditionally start telemetry monitor - Updates controller to produce a runtime manifest to trigger telemetry collection - Adds indirect proxy to produce events for the dashboard for unmanaged tasks - Adds CLI capability to launch dashboard [ committed by @ankona, @MattToast, @AlyssaCote ] [ reviewed by @al-rigazzi, @ashao ] --------- Co-authored-by: Matt Drozt Co-authored-by: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> --- .gitignore | 5 + conftest.py | 13 +- doc/index.rst | 6 + doc/smartdashboard.rst | 7 + docker/docs/dev/Dockerfile | 6 + pyproject.toml | 1 + setup.py | 1 + smartsim/_core/_cli/__main__.py | 19 +- smartsim/_core/_cli/build.py | 10 +- smartsim/_core/_cli/clean.py | 9 +- smartsim/_core/_cli/cli.py | 57 +- smartsim/_core/_cli/dbcli.py | 10 +- smartsim/_core/_cli/info.py | 7 +- smartsim/_core/_cli/plugin.py | 55 + smartsim/_core/_cli/site.py | 6 +- smartsim/_core/_cli/utils.py | 7 +- smartsim/_core/_cli/validate.py | 11 +- smartsim/_core/config/config.py | 11 + smartsim/_core/control/controller.py | 226 +++- smartsim/_core/control/job.py | 38 +- smartsim/_core/control/jobmanager.py | 22 +- smartsim/_core/control/manifest.py | 145 ++- smartsim/_core/entrypoints/indirect.py | 242 ++++ .../_core/entrypoints/telemetrymonitor.py | 691 ++++++++++ .../_core/launcher/cobalt/cobaltLauncher.py | 5 +- smartsim/_core/launcher/launcher.py | 12 +- smartsim/_core/launcher/local/local.py | 43 +- smartsim/_core/launcher/lsf/lsfLauncher.py | 9 +- smartsim/_core/launcher/pbs/pbsLauncher.py | 5 +- .../_core/launcher/slurm/slurmLauncher.py | 5 +- smartsim/_core/launcher/step/alpsStep.py | 6 +- smartsim/_core/launcher/step/localStep.py | 9 +- smartsim/_core/launcher/step/lsfStep.py | 3 +- smartsim/_core/launcher/step/mpiStep.py | 8 +- smartsim/_core/launcher/step/slurmStep.py | 10 +- smartsim/_core/launcher/step/step.py | 66 +- smartsim/_core/utils/helpers.py | 45 +- smartsim/_core/utils/serialize.py | 246 ++++ smartsim/entity/dbnode.py | 14 +- smartsim/error/errors.py | 30 +- smartsim/experiment.py | 35 + smartsim/log.py | 10 +- smartsim/wlm/slurm.py | 3 +- tests/backends/test_dbmodel.py | 9 +- tests/backends/test_dbscript.py | 8 +- tests/full_wlm/test_generic_batch_launch.py | 12 +- .../full_wlm/test_generic_orc_launch_batch.py | 8 +- tests/full_wlm/test_mpmd.py | 4 +- tests/on_wlm/test_base_settings_on_wlm.py | 8 +- tests/on_wlm/test_colocated_model.py | 38 +- tests/on_wlm/test_generic_orc_launch.py | 6 +- tests/on_wlm/test_launch_errors.py | 6 +- tests/on_wlm/test_launch_ompi_lsf.py | 2 +- tests/on_wlm/test_restart.py | 4 +- .../test_simple_base_settings_on_wlm.py | 8 +- tests/on_wlm/test_simple_entity_launch.py | 12 +- tests/on_wlm/test_stop.py | 8 +- tests/test_cli.py | 130 +- tests/test_colo_model_local.py | 38 +- tests/test_config.py | 54 + tests/test_configs/echo.py | 42 + tests/test_configs/printing_model.py | 18 + .../telemetry/colocatedmodel.json | 69 + .../test_configs/telemetry/db_and_model.json | 86 ++ .../telemetry/db_and_model_1run.json | 79 ++ tests/test_configs/telemetry/ensembles.json | 329 +++++ .../test_configs/telemetry/serialmodels.json | 186 +++ 
tests/test_configs/telemetry/telemetry.json | 946 ++++++++++++++ tests/test_controller.py | 68 + tests/test_controller_errors.py | 2 +- tests/test_dbnode.py | 2 +- tests/test_experiment.py | 23 +- tests/test_generator.py | 6 +- tests/test_helpers.py | 15 + tests/test_indirect.py | 195 +++ tests/test_launch_errors.py | 4 +- tests/test_local_launch.py | 4 +- tests/test_local_multi_run.py | 2 +- tests/test_local_restart.py | 4 +- tests/test_manifest.py | 73 +- tests/test_model.py | 7 +- tests/test_multidb.py | 15 +- tests/test_orchestrator.py | 4 +- tests/test_pals_settings.py | 13 + tests/test_reconnect_orchestrator.py | 7 +- tests/test_serialize.py | 175 +++ tests/test_telemetry_monitor.py | 1139 +++++++++++++++++ 87 files changed, 5735 insertions(+), 302 deletions(-) create mode 100644 doc/smartdashboard.rst create mode 100644 smartsim/_core/_cli/plugin.py create mode 100644 smartsim/_core/entrypoints/indirect.py create mode 100644 smartsim/_core/entrypoints/telemetrymonitor.py create mode 100644 smartsim/_core/utils/serialize.py create mode 100644 tests/test_configs/echo.py create mode 100644 tests/test_configs/printing_model.py create mode 100644 tests/test_configs/telemetry/colocatedmodel.json create mode 100644 tests/test_configs/telemetry/db_and_model.json create mode 100644 tests/test_configs/telemetry/db_and_model_1run.json create mode 100644 tests/test_configs/telemetry/ensembles.json create mode 100644 tests/test_configs/telemetry/serialmodels.json create mode 100644 tests/test_configs/telemetry/telemetry.json create mode 100644 tests/test_controller.py create mode 100644 tests/test_indirect.py create mode 100644 tests/test_serialize.py create mode 100644 tests/test_telemetry_monitor.py diff --git a/.gitignore b/.gitignore index 3c1f7db48..428e439b3 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,8 @@ smartsim/_core/bin/*-cli # created upon install smartsim/_core/lib + +**/manifest/ +**/*.err +**/*.out +**/.smartsim/* diff --git a/conftest.py b/conftest.py index 69f712d6a..2aab72cd1 100644 --- a/conftest.py +++ b/conftest.py @@ -380,10 +380,10 @@ def local_db( """Yield fixture for startup and teardown of an local orchestrator""" exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir( caller_function=exp_name, caller_fspath=request.fspath ) + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) db = Orchestrator(port=wlmutils.get_test_port(), interface="lo") db.set_path(test_dir) exp.start(db) @@ -402,10 +402,10 @@ def db( launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir( caller_function=exp_name, caller_fspath=request.fspath ) + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) db = wlmutils.get_orchestrator() db.set_path(test_dir) exp.start(db) @@ -427,10 +427,10 @@ def db_cluster( launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir( caller_function=exp_name, caller_fspath=request.fspath ) + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) db = wlmutils.get_orchestrator(nodes=3) db.set_path(test_dir) exp.start(db) @@ -630,7 +630,7 @@ def get_test_dir_path(dirname: str) -> str: return dir_path @staticmethod - def make_test_file(file_name: str, file_dir: t.Optional[str] = None) -> str: + def make_test_file(file_name: str, file_dir: t.Optional[str] = None, 
file_content: t.Optional[str] = None) -> str: """Create a dummy file in the test output directory. :param file_name: name of file to create, e.g. "file.txt" @@ -644,7 +644,10 @@ def make_test_file(file_name: str, file_dir: t.Optional[str] = None) -> str: file_path = os.path.join(test_dir, file_name) with open(file_path, "w+", encoding="utf-8") as dummy_file: - dummy_file.write("dummy\n") + if not file_content: + dummy_file.write("dummy\n") + else: + dummy_file.write(file_content) return file_path diff --git a/doc/index.rst b/doc/index.rst index d61fdb1ce..13d509257 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -48,6 +48,12 @@ sr_runtime api/smartredis_api +.. toctree:: + :maxdepth: 2 + :caption: SmartDashboard + + smartdashboard + .. toctree:: :maxdepth: 2 :caption: Reference diff --git a/doc/smartdashboard.rst b/doc/smartdashboard.rst new file mode 100644 index 000000000..532fa6db0 --- /dev/null +++ b/doc/smartdashboard.rst @@ -0,0 +1,7 @@ + +************** +SmartDashboard +************** + +.. include:: ../smartdashboard/doc/overview.rst + :start-line: 4 \ No newline at end of file diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index a27ae03c1..57fee67c9 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -52,6 +52,12 @@ RUN git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --dept && python -m pip install . \ && rm -rf ~/.cache/pip +# Install smartdashboard +RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop --depth=1 smartdashboard \ + && cd smartdashboard \ + && python -m pip install . \ + && rm -rf ~/.cache/pip + RUN cd doc/tutorials/ && \ ln -s ../../tutorials/* . diff --git a/pyproject.toml b/pyproject.toml index 24c12d8b6..cd517abb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,7 @@ module = [ "keras", "torch", "smartsim.ml.torch.*", # must solve/ignore inheritance issues + "watchdog", ] ignore_missing_imports = true ignore_errors = true diff --git a/setup.py b/setup.py index 66cc7f879..d38918f68 100644 --- a/setup.py +++ b/setup.py @@ -167,6 +167,7 @@ def has_ext_modules(_placeholder): "tqdm>=4.50.2", "filelock>=3.4.2", "protobuf~=3.20", + "watchdog>=3.0.0", ] # Add SmartRedis at specific version diff --git a/smartsim/_core/_cli/__main__.py b/smartsim/_core/_cli/__main__.py index 68d22d14f..399ca3b03 100644 --- a/smartsim/_core/_cli/__main__.py +++ b/smartsim/_core/_cli/__main__.py @@ -24,14 +24,31 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os import sys from smartsim._core._cli.cli import default_cli +from smartsim._core._cli.utils import SMART_LOGGER_FORMAT +from smartsim.error.errors import SmartSimCLIActionCancelled +from smartsim.log import get_logger + + +logger = get_logger("Smart", fmt=SMART_LOGGER_FORMAT) def main() -> int: smart_cli = default_cli() - return smart_cli.execute(sys.argv) + exception_trace_back_msg = "SmartSim exited with the following exception info:" + + try: + return smart_cli.execute(sys.argv) + except SmartSimCLIActionCancelled as ssi: + logger.info(str(ssi)) + logger.debug(exception_trace_back_msg, exc_info=ssi) + except KeyboardInterrupt as e: + logger.info("SmartSim was terminated by user") + logger.debug(exception_trace_back_msg, exc_info=e) + return os.EX_OK if __name__ == "__main__": diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index b2df26412..e3ba444ad 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -356,7 +356,9 @@ def _format_incompatible_python_env_message( ) -def execute(args: argparse.Namespace) -> int: +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: verbose = args.v keydb = args.keydb device: _TDeviceStr = args.device @@ -416,7 +418,7 @@ def execute(args: argparse.Namespace) -> int: ) except (SetupError, BuildError) as e: logger.error(str(e)) - return 1 + return os.EX_SOFTWARE backends = installed_redisai_backends() backends_str = ", ".join(s.capitalize() for s in backends) if backends else "No" @@ -431,10 +433,10 @@ def execute(args: argparse.Namespace) -> int: check_py_onnx_version(versions) except (SetupError, BuildError) as e: logger.error(str(e)) - return 1 + return os.EX_SOFTWARE logger.info("SmartSim build complete!") - return 0 + return os.EX_OK def configure_parser(parser: argparse.ArgumentParser) -> None: diff --git a/smartsim/_core/_cli/clean.py b/smartsim/_core/_cli/clean.py index fcf051f0c..d8a85f8a9 100644 --- a/smartsim/_core/_cli/clean.py +++ b/smartsim/_core/_cli/clean.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse +import typing as t from smartsim._core._cli.utils import clean, get_install_path @@ -39,10 +40,14 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: ) -def execute(args: argparse.Namespace) -> int: +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: return clean(get_install_path() / "_core", _all=args.clobber) -def execute_all(args: argparse.Namespace) -> int: +def execute_all( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: args.clobber = True return execute(args) diff --git a/smartsim/_core/_cli/cli.py b/smartsim/_core/_cli/cli.py index ce2376c15..3d50765fb 100644 --- a/smartsim/_core/_cli/cli.py +++ b/smartsim/_core/_cli/cli.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import os import typing as t from smartsim._core._cli.build import configure_parser as build_parser @@ -41,46 +42,68 @@ execute as validate_execute, configure_parser as validate_parser, ) +from smartsim._core._cli.plugin import plugins from smartsim._core._cli.utils import MenuItemConfig class SmartCli: def __init__(self, menu: t.List[MenuItemConfig]) -> None: - self.menu: t.Dict[str, MenuItemConfig] = {item.command: item for item in menu} - parser = argparse.ArgumentParser( + self.menu: t.Dict[str, MenuItemConfig] = {} + self.parser = argparse.ArgumentParser( prog="smart", description="SmartSim command line interface", ) - self.parser = parser - self.args: t.Optional[argparse.Namespace] = None - subparsers = parser.add_subparsers( + self.subparsers = self.parser.add_subparsers( dest="command", required=True, metavar="", help="Available commands", ) - for cmd, item in self.menu.items(): - parser = subparsers.add_parser( - cmd, description=item.description, help=item.description - ) - if item.configurator: - item.configurator(parser) + self.register_menu_items(menu) + self.register_menu_items([plugin() for plugin in plugins]) def execute(self, cli_args: t.List[str]) -> int: if len(cli_args) < 2: self.parser.print_help() - return 0 + return os.EX_USAGE - app_args = cli_args[1:] - self.args = self.parser.parse_args(app_args) + app_args = cli_args[1:] # exclude the path to executable + subcommand = cli_args[1] # first positional arg is the subcommand - if not (menu_item := self.menu.get(app_args[0], None)): + menu_item = self.menu.get(subcommand, None) + if not menu_item: self.parser.print_help() - return 0 + return os.EX_USAGE + + args = argparse.Namespace() + unparsed_args = [] + + if menu_item.is_plugin: + unparsed_args = app_args[1:] + else: + args = self.parser.parse_args(app_args) + + return menu_item.handler(args, unparsed_args) + + def _register_menu_item(self, item: MenuItemConfig) -> None: + parser = self.subparsers.add_parser( + item.command, description=item.description, help=item.description + ) + if item.configurator: + item.configurator(parser) + + if item.command in self.menu: + raise ValueError( + f"{item.command} cannot overwrite existing CLI command" + ) + + self.menu[item.command] = item - return menu_item.handler(self.args) + def register_menu_items(self, menu_items: t.List[MenuItemConfig]) -> None: + for item in menu_items: + self._register_menu_item(item) def default_cli() -> SmartCli: diff --git a/smartsim/_core/_cli/dbcli.py b/smartsim/_core/_cli/dbcli.py index 22a376588..ce0975bc4 100644 --- a/smartsim/_core/_cli/dbcli.py +++ b/smartsim/_core/_cli/dbcli.py @@ -25,13 +25,17 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 import argparse
+import os
+import typing as t
 
 from smartsim._core._cli.utils import get_db_path
 
 
-def execute(_args: argparse.Namespace) -> int:
+def execute(
+    _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, /
+) -> int:
     if db_path := get_db_path():
         print(db_path)
-        return 0
+        return os.EX_OK
     print("Database (Redis or KeyDB) dependencies not found")
-    return 1
+    return os.EX_SOFTWARE
diff --git a/smartsim/_core/_cli/info.py b/smartsim/_core/_cli/info.py
index 35ee9b9ec..c08fcb1a3 100644
--- a/smartsim/_core/_cli/info.py
+++ b/smartsim/_core/_cli/info.py
@@ -1,5 +1,6 @@
 import argparse
 import importlib.metadata
+import os
 import pathlib
 import typing as t
 
@@ -12,7 +13,9 @@
 _MISSING_DEP = _helpers.colorize("Not Installed", "red")
 
 
-def execute(_args: argparse.Namespace, /) -> int:
+def execute(
+    _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, /
+) -> int:
     print("\nSmart Python Packages:")
     print(
         tabulate(
@@ -66,7 +69,7 @@ def execute(_args: argparse.Namespace, /) -> int:
         ),
         end="\n\n",
     )
-    return 0
+    return os.EX_OK
 
 
 def _fmt_installed_db(db_path: t.Optional[pathlib.Path]) -> str:
diff --git a/smartsim/_core/_cli/plugin.py b/smartsim/_core/_cli/plugin.py
new file mode 100644
index 000000000..b263fe8b2
--- /dev/null
+++ b/smartsim/_core/_cli/plugin.py
@@ -0,0 +1,55 @@
+import argparse
+import importlib.util
+import os
+import sys
+import subprocess as sp
+import typing as t
+
+import smartsim.log
+from smartsim._core._cli.utils import MenuItemConfig, SMART_LOGGER_FORMAT
+from smartsim.error.errors import SmartSimCLIActionCancelled
+
+_LOGGER = smartsim.log.get_logger("Smart", fmt=SMART_LOGGER_FORMAT)
+
+
+def dynamic_execute(
+    cmd: str, plugin_name: str
+) -> t.Callable[[argparse.Namespace, t.List[str]], int]:
+    def process_execute(
+        _args: argparse.Namespace, unparsed_args: t.List[str], /
+    ) -> int:
+        try:
+            spec = importlib.util.find_spec(cmd)
+            if spec is None:
+                raise AttributeError
+        except (ModuleNotFoundError, AttributeError):
+            _LOGGER.error(f"{cmd} plugin not found. Please ensure it is installed")
+            return os.EX_CONFIG
+
+        combined_cmd = [sys.executable, "-m", cmd] + unparsed_args
+
+        try:
+            completed_proc = sp.run(combined_cmd, check=False)
+        except KeyboardInterrupt as ex:
+            msg = f"{plugin_name} terminated by user"
+            raise SmartSimCLIActionCancelled(msg) from ex
+        return completed_proc.returncode
+
+    return process_execute
+
+
+def dashboard() -> MenuItemConfig:
+    return MenuItemConfig(
+        "dashboard",
+        (
+            "Start the SmartSim dashboard to monitor experiment output from a "
+            "graphical user interface. This requires that the SmartSim Dashboard "
+            "Package be installed. For more information please visit "
+            "https://github.com/CrayLabs/SmartDashboard"
+        ),
+        dynamic_execute("smartdashboard", "Dashboard"),
+        is_plugin=True,
+    )
+
+
+plugins = (dashboard,)
diff --git a/smartsim/_core/_cli/site.py b/smartsim/_core/_cli/site.py
index 5fe667cde..c86e0341b 100644
--- a/smartsim/_core/_cli/site.py
+++ b/smartsim/_core/_cli/site.py
@@ -25,10 +25,12 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse +import os +import typing as t from smartsim._core._cli.utils import get_install_path -def execute(_args: argparse.Namespace) -> int: +def execute(_args: argparse.Namespace, _unparsed_args: t.List[str], /) -> int: print(get_install_path()) - return 0 + return os.EX_OK diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index 0be1b6ac9..d7b0f410d 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import importlib.util +import os import shutil import subprocess as sp import sys @@ -110,7 +111,7 @@ def clean(core_path: Path, _all: bool = False) -> int: if removed: logger.info("Successfully removed SmartSim database installation") - return 0 + return os.EX_OK def get_db_path() -> t.Optional[Path]: @@ -121,7 +122,7 @@ def get_db_path() -> t.Optional[Path]: return None -_CliHandler = t.Callable[[Namespace], int] +_CliHandler = t.Callable[[Namespace, t.List[str]], int] _CliParseConfigurator = t.Callable[[ArgumentParser], None] @@ -132,8 +133,10 @@ def __init__( description: str, handler: _CliHandler, configurator: t.Optional[_CliParseConfigurator] = None, + is_plugin: bool = False ): self.command = cmd self.description = description self.handler = handler self.configurator = configurator + self.is_plugin = is_plugin diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 78db15516..c796fc616 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -82,7 +82,9 @@ def __exit__( self._finalizer.detach() # type: ignore[attr-defined] -def execute(args: argparse.Namespace, /) -> int: +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: """Validate the SmartSim installation works as expected given a simple experiment """ @@ -101,10 +103,10 @@ def execute(args: argparse.Namespace, /) -> int: logger.error( "SmartSim failed to run a simple experiment!\n" f"Experiment failed due to the following exception:\n{e}\n\n" - f"Output files are available at `{temp_dir}`" + f"Output files are available at `{temp_dir}`", exc_info=True ) - return 2 - return 0 + return os.EX_SOFTWARE + return os.EX_OK def configure_parser(parser: argparse.ArgumentParser) -> None: @@ -138,6 +140,7 @@ def test_install( with_onnx: bool, ) -> None: exp = Experiment("ValidationExperiment", exp_path=location, launcher="local") + exp.disable_telemetry() port = _find_free_port() if port is None else port with _make_managed_local_orc(exp, port) as client: logger.info("Verifying Tensor Transfer") diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index a7b1471bf..2fcee90f5 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -204,6 +204,17 @@ def test_account(self) -> t.Optional[str]: # pragma: no cover # no account by default return os.environ.get("SMARTSIM_TEST_ACCOUNT", None) + @property + def telemetry_frequency(self) -> int: + return int(os.environ.get("SMARTSIM_TELEMETRY_FREQUENCY", 5)) + + @property + def telemetry_enabled(self) -> bool: + return int(os.environ.get("SMARTSIM_FLAG_TELEMETRY", "0")) > 0 + + @property + def telemetry_cooldown(self) -> int: + return int(os.environ.get("SMARTSIM_TELEMETRY_COOLDOWN", 90)) @lru_cache(maxsize=128, typed=False) def get_config() -> Config: diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 08c9f2bd8..62c5a155e 100644 --- 
a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -27,34 +27,39 @@ from __future__ import annotations import os.path as osp -from os import environ +import pathlib import pickle import signal +import subprocess +import sys import threading import time import typing as t +from os import environ from smartredis import Client, ConfigOptions from ..._core.launcher.step import Step +from ..._core.utils.helpers import unpack_colo_db_identifier, unpack_db_identifier from ..._core.utils.redis import db_is_active, set_ml_model, set_script, shutdown_db -from ..._core.utils.helpers import ( - unpack_db_identifier, - unpack_colo_db_identifier, -) from ...database import Orchestrator -from ...entity import Ensemble, EntityList, EntitySequence, Model, SmartSimEntity +from ...entity import ( + Ensemble, + EntityList, + EntitySequence, + Model, + SmartSimEntity, +) from ...error import ( LauncherError, SmartSimError, + SSDBIDConflictError, SSInternalError, SSUnsupportedError, - SSDBIDConflictError, ) from ...log import get_logger -from ...settings.base import BatchSettings +from ...servertype import CLUSTERED, STANDALONE from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES -from ...servertype import STANDALONE, CLUSTERED from ..config import CONFIG from ..launcher import ( CobaltLauncher, @@ -64,10 +69,14 @@ SlurmLauncher, ) from ..launcher.launcher import Launcher -from ..utils import check_cluster_status, create_cluster +from ..utils import check_cluster_status, create_cluster, serialize from .job import Job from .jobmanager import JobManager -from .manifest import Manifest +from .manifest import LaunchedManifest, LaunchedManifestBuilder, Manifest + +if t.TYPE_CHECKING: + from ..utils.serialize import TStepLaunchMetaData + logger = get_logger(__name__) @@ -89,9 +98,15 @@ def __init__(self, launcher: str = "local") -> None: """ self._jobs = JobManager(JM_LOCK) self.init_launcher(launcher) + self._telemetry_monitor: t.Optional[subprocess.Popen[bytes]] = None def start( - self, manifest: Manifest, block: bool = True, kill_on_interrupt: bool = True + self, + exp_name: str, + exp_path: str, + manifest: Manifest, + block: bool = True, + kill_on_interrupt: bool = True, ) -> None: """Start the passed SmartSim entities @@ -104,12 +119,20 @@ def start( self._jobs.kill_on_interrupt = kill_on_interrupt # register custom signal handler for ^C (SIGINT) signal.signal(signal.SIGINT, self._jobs.signal_interrupt) - self._launch(manifest) + launched = self._launch(exp_name, exp_path, manifest) # start the job manager thread if not already started if not self._jobs.actively_monitoring: self._jobs.start() + serialize.save_launch_manifest( + launched.map(_look_up_launched_data(self._launcher)) + ) + + # launch a telemetry monitor to track job progress + if CONFIG.telemetry_enabled: + self._start_telemetry_monitor(exp_path) + # block until all non-database jobs are complete if block: # poll handles its own keyboard interrupt as @@ -312,16 +335,25 @@ def init_launcher(self, launcher: str) -> None: else: raise TypeError("Must provide a 'launcher' argument") - def _launch(self, manifest: Manifest) -> None: + def _launch( + self, exp_name: str, exp_path: str, manifest: Manifest + ) -> LaunchedManifest[t.Tuple[str, Step]]: """Main launching function of the controller Orchestrators are always launched first so that the address of the database can be given to following entities + :param exp_name: The name of the launching experiment + :type exp_name: str + :param exp_path: path 
to location of ``Experiment`` directory if generated + :type exp_path: str :param manifest: Manifest of deployables to launch :type manifest: Manifest """ + manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]]( + exp_name=exp_name, exp_path=exp_path, launcher_name=str(self._launcher) + ) # Loop over deployables to launch and launch multiple orchestrators for orchestrator in manifest.dbs: for key in self._jobs.get_db_host_addresses(): @@ -339,7 +371,7 @@ def _launch(self, manifest: Manifest) -> None: raise SmartSimError( "Local launcher does not support multi-host orchestrators" ) - self._launch_orchestrator(orchestrator) + self._launch_orchestrator(orchestrator, manifest_builder) if self.orchestrator_active: self._set_dbobjects(manifest) @@ -348,33 +380,51 @@ def _launch(self, manifest: Manifest) -> None: steps: t.List[ t.Tuple[Step, t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]] ] = [] - all_entity_lists = manifest.ensembles - for elist in all_entity_lists: + for elist in manifest.ensembles: + ens_telem_dir = manifest_builder.run_telemetry_subdirectory / "ensemble" if elist.batch: - batch_step = self._create_batch_job_step(elist) + batch_step, substeps = self._create_batch_job_step(elist, ens_telem_dir) + manifest_builder.add_ensemble( + elist, [(batch_step.name, step) for step in substeps] + ) steps.append((batch_step, elist)) else: - job_steps = [(self._create_job_step(e), e) for e in elist.entities] + # if ensemble is to be run as separate job steps, aka not in a batch + job_steps = [ + (self._create_job_step(e, ens_telem_dir / elist.name), e) + for e in elist.entities + ] + manifest_builder.add_ensemble( + elist, [(step.name, step) for step, _ in job_steps] + ) steps.extend(job_steps) # models themselves cannot be batch steps. 
If batch settings are
        # attached, wrap them in an anonymous batch job step
        for model in manifest.models:
+            model_telem_dir = manifest_builder.run_telemetry_subdirectory / "model"
             if model.batch_settings:
-                anon_entity_list = _AnonymousBatchJob(
-                    model.name, model.path, model.batch_settings
+                anon_entity_list = _AnonymousBatchJob(model)
+                batch_step, _ = self._create_batch_job_step(
+                    anon_entity_list, model_telem_dir
                 )
-                anon_entity_list.entities.append(model)
-                batch_step = self._create_batch_job_step(anon_entity_list)
+                manifest_builder.add_model(model, (batch_step.name, batch_step))
                 steps.append((batch_step, model))
             else:
-                job_step = self._create_job_step(model)
+                job_step = self._create_job_step(model, model_telem_dir)
+                manifest_builder.add_model(model, (job_step.name, job_step))
                 steps.append((job_step, model))
 
         # launch steps
         for step, entity in steps:
             self._launch_step(step, entity)
 
-    def _launch_orchestrator(self, orchestrator: Orchestrator) -> None:
+        return manifest_builder.finalize()
+
+    def _launch_orchestrator(
+        self,
+        orchestrator: Orchestrator,
+        manifest_builder: LaunchedManifestBuilder[t.Tuple[str, Step]],
+    ) -> None:
         """Launch an Orchestrator instance
 
         This function will launch the Orchestrator instance and
@@ -383,16 +433,32 @@ def _launch_orchestrator(self, orchestrator: Orchestrator) -> None:
         :param orchestrator: orchestrator to launch
         :type orchestrator: Orchestrator
+        :param manifest_builder: A `LaunchedManifestBuilder` to record the
+                                 names and `Step`s of the launched orchestrator
+        :type manifest_builder: LaunchedManifestBuilder[tuple[str, Step]]
         """
         orchestrator.remove_stale_files()
+        orc_telem_dir = manifest_builder.run_telemetry_subdirectory / "database"
+
         # if the orchestrator was launched as a batch workload
         if orchestrator.batch:
-            orc_batch_step = self._create_batch_job_step(orchestrator)
+            orc_batch_step, substeps = self._create_batch_job_step(
+                orchestrator, orc_telem_dir
+            )
+            manifest_builder.add_database(
+                orchestrator, [(orc_batch_step.name, step) for step in substeps]
+            )
             self._launch_step(orc_batch_step, orchestrator)
 
         # if orchestrator was run on existing allocation, locally, or in allocation
         else:
-            db_steps = [(self._create_job_step(db), db) for db in orchestrator.entities]
+            db_steps = [
+                (self._create_job_step(db, orc_telem_dir / orchestrator.name), db)
+                for db in orchestrator.entities
+            ]
+            manifest_builder.add_database(
+                orchestrator, [(step.name, step) for step, _ in db_steps]
+            )
             for db_step in db_steps:
                 self._launch_step(*db_step)
 
@@ -462,35 +528,52 @@ def _launch_step(
         self._jobs.add_job(job_step.name, job_id, entity, is_task)
 
     def _create_batch_job_step(
-        self, entity_list: t.Union[Orchestrator, Ensemble, _AnonymousBatchJob]
-    ) -> Step:
+        self,
+        entity_list: t.Union[Orchestrator, Ensemble, _AnonymousBatchJob],
+        telemetry_dir: pathlib.Path,
+    ) -> t.Tuple[Step, t.List[Step]]:
         """Use launcher to create batch job step
 
         :param entity_list: EntityList to launch as batch
         :type entity_list: EntityList
+        :param telemetry_dir: Path to a directory in which the batch job step
+                              may write telemetry events
+        :type telemetry_dir: pathlib.Path
+        :return: batch job step instance and a list of run steps to be
+                 executed within the batch job
+        :rtype: tuple[Step, list[Step]]
         """
         if not entity_list.batch_settings:
             raise ValueError(
                 "EntityList must have batch settings to be launched as batch"
             )
 
+        telemetry_dir = telemetry_dir / entity_list.name
         batch_step = self._launcher.create_step(
             entity_list.name, entity_list.path, entity_list.batch_settings
         )
+        batch_step.meta["entity_type"] = str(type(entity_list).__name__).lower()
+        batch_step.meta["status_dir"] = str(telemetry_dir / entity_list.name)
+
+        substeps = []
         for entity in entity_list.entities:
             # tells step creation not to look for an allocation
             entity.run_settings.in_batch = True
-            step = self._create_job_step(entity)
+            step = self._create_job_step(entity, telemetry_dir)
+            substeps.append(step)
             batch_step.add_to_batch(step)
-        return batch_step
+        return batch_step, substeps
 
-    def _create_job_step(self, entity: SmartSimEntity) -> Step:
+    def _create_job_step(
+        self, entity: SmartSimEntity, telemetry_dir: pathlib.Path
+    ) -> Step:
         """Create job steps for all entities with the launcher
 
         :param entity: an entity to create a step for
         :type entity: SmartSimEntity
+        :param telemetry_dir: Path to a directory in which the job step
+                              may write telemetry events
+        :type telemetry_dir: pathlib.Path
         :return: the job step
         :rtype: Step
         """
@@ -499,6 +582,10 @@ def _create_job_step(self, entity: SmartSimEntity) -> Step:
             self._prep_entity_client_env(entity)
 
         step = self._launcher.create_step(entity.name, entity.path, entity.run_settings)
+
+        step.meta["entity_type"] = str(type(entity).__name__).lower()
+        step.meta["status_dir"] = str(telemetry_dir / entity.name)
+
         return step
 
     def _prep_entity_client_env(self, entity: Model) -> None:
@@ -739,13 +826,74 @@ def _set_dbobjects(self, manifest: Manifest) -> None:
                     if db_script not in ensemble.db_scripts:
                         set_script(db_script, client)
 
+    def _start_telemetry_monitor(self, exp_dir: str) -> None:
+        """Spawns a telemetry monitor process to keep track of the lifetimes
+        of the processes launched through this controller.
+
+        :param exp_dir: An experiment directory
+        :type exp_dir: str
+        """
+        logger.debug("Starting telemetry monitor process")
+        if (
+            self._telemetry_monitor is None
+            or self._telemetry_monitor.returncode is not None
+        ):
+            cmd = [
+                sys.executable,
+                "-m",
+                "smartsim._core.entrypoints.telemetrymonitor",
+                "-exp_dir",
+                exp_dir,
+                "-frequency",
+                str(CONFIG.telemetry_frequency),
+                "-cooldown",
+                str(CONFIG.telemetry_cooldown),
+            ]
+            # pylint: disable-next=consider-using-with
+            self._telemetry_monitor = subprocess.Popen(
+                cmd,
+                stderr=sys.stderr,
+                stdout=sys.stdout,
+                cwd=str(pathlib.Path(__file__).parent.parent.parent),
+                shell=False,
+            )
+
 
 class _AnonymousBatchJob(EntityList[Model]):
-    def __init__(
-        self, name: str, path: str, batch_settings: BatchSettings, **kwargs: t.Any
-    ) -> None:
-        super().__init__(name, path)
-        self.batch_settings = batch_settings
+    @staticmethod
+    def _validate(model: Model) -> None:
+        if model.batch_settings is None:
+            msg = "Unable to create _AnonymousBatchJob without batch_settings"
+            raise SmartSimError(msg)
+
+    def __init__(self, model: Model) -> None:
+        self._validate(model)
+        super().__init__(model.name, model.path)
+        self.entities = [model]
+        self.batch_settings = model.batch_settings
 
     def _initialize_entities(self, **kwargs: t.Any) -> None:
         ...
+ + +def _look_up_launched_data( + launcher: Launcher, +) -> t.Callable[[t.Tuple[str, Step]], "TStepLaunchMetaData"]: + def _unpack_launched_data(data: t.Tuple[str, Step]) -> "TStepLaunchMetaData": + # NOTE: we cannot assume that the name of the launched step + # ``launched_step_name`` is equal to the name of the step referring to + # the entity ``step.name`` as is the case when an entity list is + # launched as a batch job + launched_step_name, step = data + launched_step_map = launcher.step_mapping[launched_step_name] + out_file, err_file = step.get_output_files() + return ( + launched_step_map.step_id, + launched_step_map.task_id, + launched_step_map.managed, + out_file, + err_file, + pathlib.Path(step.meta.get("status_dir", step.cwd)), + ) + + return _unpack_launched_data diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index 2842c3c14..3a54c0d00 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -27,10 +27,44 @@ import time import typing as t +from dataclasses import dataclass from ...entity import SmartSimEntity, EntitySequence from ...status import STATUS_NEW +@dataclass(frozen=True) +class _JobKey(): + step_id: str + task_id: str + + +class JobEntity: + """API required for a job processed in the JobManager with support for + telemetry monitoring + """ + + def __init__(self) -> None: + self.name: str = "" + self.path: str = "" + self.step_id: str = "" + self.task_id: str = "" + self.type: str = "" + self.timestamp: int = 0 + self.status_dir: str = "" + + @property + def is_db(self) -> bool: + return self.type in ["orchestrator", "dbnode"] + + @property + def is_managed(self) -> bool: + return bool(self.step_id) + + @property + def key(self) -> _JobKey: + return _JobKey(self.step_id, self.task_id) + + class Job: """Keep track of various information for the controller. In doing so, continuously add various fields of information @@ -42,7 +76,7 @@ def __init__( self, job_name: str, job_id: t.Optional[str], - entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], launcher: str, is_task: bool, ) -> None: @@ -53,7 +87,7 @@ def __init__( :param job_id: The id associated with the job :type job_id: str :param entity: The SmartSim entity(list) associated with the job - :type entity: SmartSimEntity | EntitySequence + :type entity: SmartSimEntity | EntitySequence | JobEntity :param launcher: Launcher job was started with :type launcher: str :param is_task: process monitored by TaskManager (True) or the WLM (True) diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 022a2c85c..90eedd229 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -24,9 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ import itertools import time import typing as t +from collections import ChainMap from threading import Thread, RLock from types import FrameType @@ -38,7 +40,8 @@ from ..config import CONFIG from ..launcher import LocalLauncher, Launcher from ..utils.network import get_ip_from_host -from .job import Job +from .job import Job, JobEntity + logger = get_logger(__name__) @@ -145,13 +148,8 @@ def __getitem__(self, entity_name: str) -> Job: :rtype: Job """ with self._lock: - if entity_name in self.db_jobs: - return self.db_jobs[entity_name] - if entity_name in self.jobs: - return self.jobs[entity_name] - if entity_name in self.completed: - return self.completed[entity_name] - raise KeyError + entities = ChainMap(self.db_jobs, self.jobs, self.completed) + return entities[entity_name] def __call__(self) -> t.Dict[str, Job]: """Returns dictionary all jobs for () operator @@ -166,7 +164,7 @@ def add_job( self, job_name: str, job_id: t.Optional[str], - entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], is_task: bool = True, ) -> None: """Add a job to the job manager which holds specific jobs by type. @@ -185,7 +183,8 @@ def add_job( job = Job(job_name, job_id, entity, launcher, is_task) if isinstance(entity, (DBNode, Orchestrator)): self.db_jobs[entity.name] = job - + elif isinstance(entity, JobEntity) and entity.is_db: + self.db_jobs[entity.name] = job else: self.jobs[entity.name] = job @@ -310,7 +309,8 @@ def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: for corresponding database identifiers :return: dictionary of host ip addresses - :rtype: Dict[str, list]""" + :rtype: Dict[str, list] + """ address_dict = {} for db_job in self.db_jobs.values(): diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 65aa8a898..ec1d79165 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -24,12 +24,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pathlib import typing as t +from dataclasses import dataclass, field from ...database import Orchestrator -from ...entity import EntitySequence, SmartSimEntity, Model, Ensemble +from ...entity import DBNode, Ensemble, EntitySequence, Model, SmartSimEntity from ...error import SmartSimError -from ..utils.helpers import fmt_dict +from ..utils import helpers as _helpers +from ..utils import serialize as _serialize + +_T = t.TypeVar("_T") +_U = t.TypeVar("_U") +_AtomicLaunchableT = t.TypeVar("_AtomicLaunchableT", Model, DBNode) + +if t.TYPE_CHECKING: + import os class Manifest: @@ -92,7 +102,6 @@ def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: """ _all_entity_lists: t.List[EntitySequence[SmartSimEntity]] = list(self.ensembles) - for db in self.dbs: _all_entity_lists.append(db) @@ -150,7 +159,7 @@ def __str__(self) -> str: output += f"{model.batch_settings}\n" output += f"{model.run_settings}\n" if model.params: - output += f"Parameters: \n{fmt_dict(model.params)}\n" + output += f"Parameters: \n{_helpers.fmt_dict(model.params)}\n" output += "\n" for adb in self.dbs: @@ -214,3 +223,131 @@ def has_db_scripts( # `has_db_objects` should be False here return has_db_objects + + + +class _LaunchedManifestMetadata(t.NamedTuple): + run_id: str + exp_name: str + exp_path: str + launcher_name: str + + @property + def exp_telemetry_subdirectory(self) -> pathlib.Path: + return _format_exp_telemetry_path(self.exp_path) + + @property + def run_telemetry_subdirectory(self) -> pathlib.Path: + return _format_run_telemetry_path(self.exp_path, self.exp_name, self.run_id) + + @property + def manifest_file_path(self) -> pathlib.Path: + return self.exp_telemetry_subdirectory / _serialize.MANIFEST_FILENAME + + +@dataclass(frozen=True) +class LaunchedManifest(t.Generic[_T]): + """Immutable manifest mapping launched entities or collections of launched + entities to other pieces of external data. This is commonly used to map a + launch-able entity to its constructed ``Step`` instance without assuming + that ``step.name == job.name`` or querying the ``JobManager`` which itself + can be ephemeral. + """ + + metadata: _LaunchedManifestMetadata + models: t.Tuple[t.Tuple[Model, _T], ...] + ensembles: t.Tuple[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]], ...] + databases: t.Tuple[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]], ...] + + def map(self, func: t.Callable[[_T], _U]) -> "LaunchedManifest[_U]": + def _map_entity_data( + fn: t.Callable[[_T], _U], + entity_list: t.Sequence[t.Tuple[_AtomicLaunchableT, _T]], + ) -> t.Tuple[t.Tuple[_AtomicLaunchableT, _U], ...]: + return tuple((entity, fn(data)) for entity, data in entity_list) + + return LaunchedManifest( + metadata=self.metadata, + models=_map_entity_data(func, self.models), + ensembles=tuple( + (ens, _map_entity_data(func, model_data)) + for ens, model_data in self.ensembles + ), + databases=tuple( + (db_, _map_entity_data(func, node_data)) + for db_, node_data in self.databases + ), + ) + + +@dataclass(frozen=True) +class LaunchedManifestBuilder(t.Generic[_T]): + """A class comprised of mutable collections of SmartSim entities that is + used to build a ``LaunchedManifest`` while going through the launching + process. 
+ """ + + exp_name: str + exp_path: str + launcher_name: str + run_id: str = field(default_factory=_helpers.create_short_id_str) + + _models: t.List[t.Tuple[Model, _T]] = field(default_factory=list, init=False) + _ensembles: t.List[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]]] = field( + default_factory=list, init=False + ) + _databases: t.List[ + t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]] + ] = field(default_factory=list, init=False) + + @property + def exp_telemetry_subdirectory(self) -> pathlib.Path: + return _format_exp_telemetry_path(self.exp_path) + + @property + def run_telemetry_subdirectory(self) -> pathlib.Path: + return _format_run_telemetry_path(self.exp_path, self.exp_name, self.run_id) + + def add_model(self, model: Model, data: _T) -> None: + self._models.append((model, data)) + + def add_ensemble(self, ens: Ensemble, data: t.Sequence[_T]) -> None: + self._ensembles.append((ens, self._entities_to_data(ens.entities, data))) + + def add_database(self, db_: Orchestrator, data: t.Sequence[_T]) -> None: + self._databases.append((db_, self._entities_to_data(db_.entities, data))) + + @staticmethod + def _entities_to_data( + entities: t.Sequence[_AtomicLaunchableT], data: t.Sequence[_T] + ) -> t.Tuple[t.Tuple[_AtomicLaunchableT, _T], ...]: + if not entities: + raise ValueError("Cannot map data to an empty entity sequence") + if len(entities) != len(data): + raise ValueError( + f"Cannot map data sequence of length {len(data)} to entity " + f"sequence of length {len(entities)}" + ) + return tuple(zip(entities, data)) + + def finalize(self) -> LaunchedManifest[_T]: + return LaunchedManifest( + metadata=_LaunchedManifestMetadata( + self.run_id, self.exp_name, self.exp_path, self.launcher_name + ), + models=tuple(self._models), + ensembles=tuple(self._ensembles), + databases=tuple(self._databases), + ) + + +def _format_exp_telemetry_path( + exp_path: t.Union[str, "os.PathLike[str]"] +) -> pathlib.Path: + return pathlib.Path(exp_path, _serialize.TELMON_SUBDIR) + + +def _format_run_telemetry_path( + exp_path: t.Union[str, "os.PathLike[str]"], exp_name: str, run_id: str +) -> pathlib.Path: + return _format_exp_telemetry_path(exp_path) / f"{exp_name}/{run_id}" diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py new file mode 100644 index 000000000..18d27601f --- /dev/null +++ b/smartsim/_core/entrypoints/indirect.py @@ -0,0 +1,242 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import logging
+import os
+import pathlib
+import signal
+import sys
+import typing as t
+from types import FrameType
+
+import coloredlogs
+import psutil
+
+import smartsim.log
+from smartsim._core.entrypoints.telemetrymonitor import track_event
+from smartsim._core.utils.helpers import decode_cmd, get_ts
+
+STEP_PID: t.Optional[int] = None
+logger = smartsim.log.get_logger(__name__)
+
+# kill is not catchable
+SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT, signal.SIGABRT]
+
+
+def main(
+    cmd: str,
+    etype: str,
+    cwd: str,
+    status_dir: str,
+) -> int:
+    """The main function of the entrypoint. This function takes an encoded step
+    command and runs it in a subprocess. The entrypoint then monitors the
+    subprocess and writes status events, such as when the subprocess starts
+    or stops, to a status directory.
+    """
+    global STEP_PID  # pylint: disable=global-statement
+    proxy_pid = os.getpid()
+
+    status_path = pathlib.Path(status_dir)
+    if not status_path.exists():
+        status_path.mkdir(parents=True, exist_ok=True)
+
+    if not cmd.strip():
+        raise ValueError("Invalid cmd supplied")
+
+    cleaned_cmd = decode_cmd(cmd)
+    ret_code: int = 1
+    logger.debug("Indirect step starting")
+
+    start_detail = f"Proxy process {proxy_pid}"
+    start_rc: t.Optional[int] = None
+
+    try:
+        process = psutil.Popen(
+            cleaned_cmd,
+            cwd=cwd,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        STEP_PID = process.pid
+        logger.info(f"Indirect proxy {proxy_pid} child process {STEP_PID} started")
+        start_detail += f" started child process {STEP_PID}"
+
+    except Exception as ex:
+        start_detail += f" failed to start child process. {ex}"
+        start_rc = 1
+        logger.error("Failed to create process", exc_info=True)
+        cleanup()
+        return 1
+    finally:
+        track_event(
+            get_ts(),
+            proxy_pid,
+            "",  # step_id for unmanaged task is always empty
+            etype,
+            "start",
+            status_path,
+            logger,
+            detail=start_detail,
+            return_code=start_rc,
+        )
+
+    logger.info(f"Waiting for child process {STEP_PID} to complete")
+    ret_code = process.wait()
+
+    logger.info(
+        f"Indirect proxy {proxy_pid} child process {STEP_PID} complete."
+        f" return code: {ret_code}"
+    )
+    msg = f"Process {STEP_PID} finished with return code: {ret_code}"
+    track_event(
+        get_ts(),
+        proxy_pid,
+        "",  # step_id for unmanaged task is always empty
+        etype,
+        "stop",
+        status_path,
+        logger,
+        detail=msg,
+        return_code=ret_code,
+    )
+    cleanup()
+
+    return ret_code
+
+
+def cleanup() -> None:
+    """Perform cleanup required for clean termination"""
+    logger.info("Performing cleanup")
+    global STEP_PID  # pylint: disable=global-statement
+    if STEP_PID is None:
+        return
+
+    try:
+        # attempt to stop the subprocess performing step-execution
+        if psutil.pid_exists(STEP_PID):
+            process = psutil.Process(STEP_PID)
+            process.terminate()
+    except psutil.NoSuchProcess:
+        # swallow exception to avoid overwriting outputs from cmd
+        ...
+
+    except OSError as ex:
+        logger.warning(f"Failed to clean up step executor gracefully: {ex}")
+    finally:
+        STEP_PID = None
+
+
+def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None:
+    """Helper function to ensure clean process termination"""
+    logger.info(f"handling signal {signo}")
+    if not signo:
+        logger.warning("Received signal with no signo")
+
+    cleanup()
+
+
+def register_signal_handlers() -> None:
+    """Register a signal handling function for all termination events"""
+    for sig in SIGNALS:
+        signal.signal(sig, handle_signal)
+
+
+def get_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prefix_chars="+", description="SmartSim Step Executor"
+    )
+    parser.add_argument(
+        "+name", type=str, help="Name of the step being executed", required=True
+    )
+    parser.add_argument(
+        "+command", type=str, help="The command to execute", required=True
+    )
+    parser.add_argument(
+        "+entity_type",
+        type=str,
+        help="The type of entity related to the step",
+        required=True,
+    )
+    parser.add_argument(
+        "+working_dir",
+        type=str,
+        help="The working directory of the executable",
+        required=True,
+    )
+    parser.add_argument(
+        "+telemetry_dir",
+        type=str,
+        help="Directory for telemetry output",
+        required=True,
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    arg_parser = get_parser()
+    os.environ["PYTHONUNBUFFERED"] = "1"
+    parsed_args = arg_parser.parse_args()
+
+    # Set up a local private logger for when this module is run as an entry point
+    level = logger.getEffectiveLevel()
+    logger = logging.getLogger(f"{__name__}.{parsed_args.name}")
+    logger.propagate = False
+    logger.setLevel(level)
+
+    fh = logging.FileHandler(f"{parsed_args.name}.indirect.log")
+    coloredlogs.HostNameFilter.install(fh)
+    fh.setFormatter(
+        logging.Formatter(
+            smartsim.log.DEFAULT_LOG_FORMAT,
+            datefmt=smartsim.log.DEFAULT_DATE_FORMAT,
+        )
+    )
+    logger.addHandler(fh)
+
+    try:
+        logger.debug("Starting indirect step execution")
+
+        # make sure to register the cleanup before we start the process
+        # so our signal handler will be able to stop the child process.
+        register_signal_handlers()
+
+        rc = main(
+            cmd=parsed_args.command,
+            etype=parsed_args.entity_type,
+            cwd=parsed_args.working_dir,
+            status_dir=parsed_args.telemetry_dir,
+        )
+        sys.exit(rc)
+
+    # any unexpected failure during step execution is logged and the
+    # proxy exits with a non-zero return code
+    except Exception as e:
+        logger.exception(f"An unexpected error caused step execution to fail: {e}")
+        sys.exit(1)
diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py
new file mode 100644
index 000000000..cb80e6918
--- /dev/null
+++ b/smartsim/_core/entrypoints/telemetrymonitor.py
@@ -0,0 +1,691 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2023 Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import json +import logging +import os +import pathlib +import signal +import sys +import threading +import time +import typing as t + +from dataclasses import dataclass, field +from types import FrameType + +from watchdog.observers import Observer +from watchdog.observers.api import BaseObserver +from watchdog.events import PatternMatchingEventHandler, LoggingEventHandler +from watchdog.events import FileCreatedEvent, FileModifiedEvent + +from smartsim._core.config import CONFIG +from smartsim._core.control.job import JobEntity, _JobKey +from smartsim._core.control.jobmanager import JobManager +from smartsim._core.launcher.stepInfo import StepInfo + + +from smartsim._core.launcher.cobalt.cobaltLauncher import CobaltLauncher +from smartsim._core.launcher.launcher import Launcher +from smartsim._core.launcher.local.local import LocalLauncher +from smartsim._core.launcher.lsf.lsfLauncher import LSFLauncher +from smartsim._core.launcher.pbs.pbsLauncher import PBSLauncher +from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher +from smartsim._core.utils.helpers import get_ts +from smartsim._core.utils.serialize import TELMON_SUBDIR, MANIFEST_FILENAME + +from smartsim.error.errors import SmartSimError +from smartsim.status import STATUS_COMPLETED, TERMINAL_STATUSES + + +"""Telemetry Monitor entrypoint""" + +# kill is not catchable +SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] +_EventClass = t.Literal["start", "stop", "timestep"] +_MAX_MANIFEST_LOAD_ATTEMPTS: t.Final[int] = 6 + + +@dataclass +class Run: + """Model containing entities of an individual start call for an experiment""" + + timestamp: int + models: t.List[JobEntity] + orchestrators: t.List[JobEntity] + ensembles: t.List[JobEntity] + + def flatten( + self, filter_fn: t.Optional[t.Callable[[JobEntity], bool]] = None + ) -> t.List[JobEntity]: + """Flatten runs into a list of SmartSimEntity run events""" + entities = self.models + self.orchestrators + self.ensembles + if filter_fn: + entities = [entity for entity in entities if filter_fn(entity)] + return entities + + +@dataclass +class RuntimeManifest: + """The runtime manifest holds meta information about the experiment entities created + at runtime to satisfy the experiment requirements. 
+ """ + + name: str + path: pathlib.Path + launcher: str + runs: t.List[Run] = field(default_factory=list) + + +def _hydrate_persistable( + persistable_entity: t.Dict[str, t.Any], + entity_type: str, + exp_dir: str, +) -> JobEntity: + """Populate JobEntity instance with supplied metdata and instance details""" + entity = JobEntity() + + metadata = persistable_entity["telemetry_metadata"] + status_dir = pathlib.Path(metadata.get("status_dir")) + + entity.type = entity_type + entity.name = persistable_entity["name"] + entity.step_id = str(metadata.get("step_id") or "") + entity.task_id = str(metadata.get("task_id") or "") + entity.timestamp = int(persistable_entity.get("timestamp", "0")) + entity.path = str(exp_dir) + entity.status_dir = str(status_dir) + + return entity + + +def hydrate_persistable( + entity_type: str, + persistable_entity: t.Dict[str, t.Any], + exp_dir: pathlib.Path, +) -> t.List[JobEntity]: + """Map entity data persisted in a manifest file to an object""" + entities = [] + + # an entity w/parent key creates persistables for entities it contains + parent_keys = {"shards", "models"} + parent_keys = parent_keys.intersection(persistable_entity.keys()) + if parent_keys: + container = "shards" if "shards" in parent_keys else "models" + child_type = "orchestrator" if container == "shards" else "model" + for child_entity in persistable_entity[container]: + entity = _hydrate_persistable(child_entity, child_type, str(exp_dir)) + entities.append(entity) + + return entities + + entity = _hydrate_persistable(persistable_entity, entity_type, str(exp_dir)) + entities.append(entity) + return entities + + +def hydrate_persistables( + entity_type: str, + run: t.Dict[str, t.Any], + exp_dir: pathlib.Path, +) -> t.Dict[str, t.List[JobEntity]]: + """Map a collection of entity data persisted in a manifest file to an object""" + persisted: t.Dict[str, t.List[JobEntity]] = { + "model": [], + "orchestrator": [], + } + for item in run[entity_type]: + entities = hydrate_persistable(entity_type, item, exp_dir) + for new_entity in entities: + persisted[new_entity.type].append(new_entity) + + return persisted + + +def hydrate_runs( + persisted_runs: t.List[t.Dict[str, t.Any]], exp_dir: pathlib.Path +) -> t.List[Run]: + """Map run data persisted in a manifest file to an object""" + the_runs: t.List[Run] = [] + for run_instance in persisted_runs: + run_entities: t.Dict[str, t.List[JobEntity]] = { + "model": [], + "orchestrator": [], + "ensemble": [], + } + + for key in run_entities: + _entities = hydrate_persistables(key, run_instance, exp_dir) + for entity_type, new_entities in _entities.items(): + if new_entities: + run_entities[entity_type].extend(new_entities) + + run = Run( + run_instance["timestamp"], + run_entities["model"], + run_entities["orchestrator"], + run_entities["ensemble"], + ) + the_runs.append(run) + + return the_runs + + +def load_manifest(file_path: str) -> t.Optional[RuntimeManifest]: + """Load a persisted manifest and return the content""" + manifest_dict: t.Optional[t.Dict[str, t.Any]] = None + try_count = 1 + + while manifest_dict is None and try_count < _MAX_MANIFEST_LOAD_ATTEMPTS: + source = pathlib.Path(file_path) + source = source.resolve() + + try: + if text := source.read_text(encoding="utf-8").strip(): + manifest_dict = json.loads(text) + except json.JSONDecodeError as ex: + print(f"Error loading manifest: {ex}") + # hack/fix: handle issues reading file before it is fully written + time.sleep(0.5 * try_count) + finally: + try_count += 1 + + if not manifest_dict: + return 
None + + exp = manifest_dict.get("experiment", None) + if not exp: + raise ValueError("Manifest missing required experiment") + + runs = manifest_dict.get("runs", None) + if runs is None: + raise ValueError("Manifest missing required runs") + + exp_dir = pathlib.Path(exp["path"]) + runs = hydrate_runs(runs, exp_dir) + + manifest = RuntimeManifest( + name=exp["name"], + path=exp_dir, + launcher=exp["launcher"], + runs=runs, + ) + return manifest + + +def track_event( + timestamp: int, + task_id: t.Union[int, str], + step_id: str, + etype: str, + action: _EventClass, + status_dir: pathlib.Path, + logger: logging.Logger, + detail: str = "", + return_code: t.Optional[int] = None, +) -> None: + """Persist a tracking event for an entity""" + tgt_path = status_dir / f"{action}.json" + tgt_path.parent.mkdir(parents=True, exist_ok=True) + + try: + task_id = int(task_id) + except ValueError: + pass + + entity_dict = { + "timestamp": timestamp, + "job_id": task_id, + "step_id": step_id, + "type": etype, + "action": action, + } + + if detail is not None: + entity_dict["detail"] = detail + + if return_code is not None: + entity_dict["return_code"] = return_code + + try: + if not tgt_path.exists(): + # Don't overwrite existing tracking files + bytes_written = tgt_path.write_text(json.dumps(entity_dict, indent=2)) + if bytes_written < 1: + logger.warning("event tracking failed to write tracking file.") + except Exception: + logger.error("Unable to write tracking file.", exc_info=True) + + +def faux_return_code(step_info: StepInfo) -> t.Optional[int]: + """Create a faux return code for a task run by the WLM. Must not be + called with non-terminal statuses or results may be confusing + """ + if step_info.status not in TERMINAL_STATUSES: + return None + + if step_info.status == STATUS_COMPLETED: + return os.EX_OK + + return 1 + + +class ManifestEventHandler(PatternMatchingEventHandler): + """The ManifestEventHandler monitors an experiment for changes and updates + a telemetry datastore as needed. + + It contains event handlers that are triggered by changes to a runtime experiment + manifest. The runtime manifest differs from a standard manifest. A runtime manifest + may contain multiple experiment executions in a `runs` collection. + + It also contains a long-polling loop that checks experiment entities for updates + at each timestep. + """ + + def __init__( + self, + pattern: str, + logger: logging.Logger, + ignore_patterns: t.Any = None, + ignore_directories: bool = True, + case_sensitive: bool = False, + ) -> None: + super().__init__( + [pattern], ignore_patterns, ignore_directories, case_sensitive + ) # type: ignore + self._logger = logger + self._tracked_runs: t.Dict[int, Run] = {} + self._tracked_jobs: t.Dict[_JobKey, JobEntity] = {} + self._completed_jobs: t.Dict[_JobKey, JobEntity] = {} + self._launcher: t.Optional[Launcher] = None + self.job_manager: JobManager = JobManager(threading.RLock()) + self._launcher_map: t.Dict[str, t.Type[Launcher]] = { + "slurm": SlurmLauncher, + "pbs": PBSLauncher, + "cobalt": CobaltLauncher, + "lsf": LSFLauncher, + "local": LocalLauncher, + } + + def init_launcher(self, launcher: str) -> Launcher: + """Initialize the controller with a specific type of launcher. + SmartSim currently supports slurm, pbs(pro), cobalt, lsf, + and local launching + + :param launcher: which launcher to initialize + :type launcher: str + :raises SSUnsupportedError: if a string is passed that is not + a supported launcher + :raises TypeError: if no launcher argument is provided. 
+ """ + if not launcher: + raise TypeError("Must provide a 'launcher' argument") + + if launcher_type := self._launcher_map.get(launcher.lower(), None): + return launcher_type() + + raise ValueError("Launcher type not supported: " + launcher) + + def set_launcher(self, launcher_type: str) -> None: + """Set the launcher for the experiment""" + self._launcher = self.init_launcher(launcher_type) + self.job_manager.set_launcher(self._launcher) + self.job_manager.start() + + def process_manifest(self, manifest_path: str) -> None: + """Read the runtime manifest for the experiment and track new entities + + :param manifest_path: The full path to the manifest file + :type manifest_path: str + """ + try: + manifest = load_manifest(manifest_path) + if not manifest: + return + except json.JSONDecodeError: + self._logger.error(f"Malformed manifest encountered: {manifest_path}") + return + except ValueError: + self._logger.error("Manifest content error", exc_info=True) + return + + if self._launcher is None: + self.set_launcher(manifest.launcher) + + if not self._launcher: + raise SmartSimError(f"Unable to set launcher from {manifest_path}") + + runs = [run for run in manifest.runs if run.timestamp not in self._tracked_runs] + + exp_dir = pathlib.Path(manifest_path).parent.parent.parent + + for run in runs: + for entity in run.flatten( + filter_fn=lambda e: e.key not in self._tracked_jobs and e.is_managed + ): + entity.path = str(exp_dir) + + self._tracked_jobs[entity.key] = entity + track_event( + run.timestamp, + entity.task_id, + entity.step_id, + entity.type, + "start", + pathlib.Path(entity.status_dir), + self._logger, + ) + + if entity.is_managed: + self.job_manager.add_job( + entity.name, + entity.task_id, + entity, + False, + ) + self._launcher.step_mapping.add( + entity.name, entity.step_id, entity.task_id, True + ) + self._tracked_runs[run.timestamp] = run + + def on_modified(self, event: FileModifiedEvent) -> None: + """Event handler for when a file or directory is modified. + + :param event: Event representing file/directory modification. + :type event: FileModifiedEvent + """ + super().on_modified(event) # type: ignore + self._logger.info(f"processing manifest modified @ {event.src_path}") + self.process_manifest(event.src_path) + + def on_created(self, event: FileCreatedEvent) -> None: + """Event handler for when a file or directory is created. + + :param event: Event representing file/directory creation. + :type event: FileCreatedEvent + """ + super().on_created(event) # type: ignore + self._logger.info(f"processing manifest created @ {event.src_path}") + self.process_manifest(event.src_path) + + def _to_completed( + self, + timestamp: int, + entity: JobEntity, + step_info: StepInfo, + ) -> None: + """Move a monitored entity from the active to completed collection to + stop monitoring for updates during timesteps. 
+
+        :param timestamp: the current timestamp for event logging
+        :type timestamp: int
+        :param entity: the running SmartSim Job
+        :type entity: JobEntity
+        :param step_info: the StepInfo received when requesting a Job status update
+        :type step_info: StepInfo
+        """
+        inactive_entity = self._tracked_jobs.pop(entity.key)
+        if entity.key not in self._completed_jobs:
+            self._completed_jobs[entity.key] = inactive_entity
+
+        job = self.job_manager[entity.name]
+        self.job_manager.move_to_completed(job)
+
+        status_clause = f"status: {step_info.status}"
+        error_clause = f", error: {step_info.error}" if step_info.error else ""
+        detail = f"{status_clause}{error_clause}"
+
+        write_path = pathlib.Path(entity.status_dir)
+        if hasattr(job.entity, "status_dir"):
+            write_path = pathlib.Path(job.entity.status_dir)
+
+        track_event(
+            timestamp,
+            entity.task_id,
+            entity.step_id,
+            entity.type,
+            "stop",
+            write_path,
+            self._logger,
+            detail=detail,
+            return_code=faux_return_code(step_info),
+        )
+
+    def on_timestep(self, timestamp: int) -> None:
+        """Called at polling frequency to request status updates on
+        monitored entities
+
+        :param timestamp: the current timestamp for event logging
+        :type timestamp: int
+        """
+        entity_map = self._tracked_jobs
+
+        if not self._launcher:
+            return
+
+        # consider not using name to avoid collisions
+        names = {entity.name: entity for entity in entity_map.values()}
+
+        if names:
+            step_updates = self._launcher.get_step_update(list(names.keys()))
+
+            for step_name, step_info in step_updates:
+                if step_info and step_info.status in TERMINAL_STATUSES:
+                    completed_entity = names[step_name]
+                    self._to_completed(timestamp, completed_entity, step_info)
+
+
+def can_shutdown(action_handler: ManifestEventHandler, logger: logging.Logger) -> bool:
+    jobs = action_handler.job_manager.jobs
+    db_jobs = action_handler.job_manager.db_jobs
+
+    has_jobs = bool(jobs)
+    has_dbs = bool(db_jobs)
+    has_running_jobs = has_jobs or has_dbs
+
+    if has_jobs:
+        logger.debug(f"telemetry monitor is monitoring {len(jobs)} jobs")
+    if has_dbs:
+        logger.debug(f"telemetry monitor is monitoring {len(db_jobs)} dbs")
+
+    return not has_running_jobs
+
+
+def event_loop(
+    observer: BaseObserver,
+    action_handler: ManifestEventHandler,
+    frequency: t.Union[int, float],
+    logger: logging.Logger,
+    cooldown_duration: int,
+) -> None:
+    """Execute all attached timestep handlers every `frequency` seconds
+
+    :param observer: a preconfigured watchdog Observer to inject
+    :type observer: BaseObserver
+    :param action_handler: The manifest event processor instance
+    :type action_handler: ManifestEventHandler
+    :param frequency: frequency (in seconds) of update loop
+    :type frequency: t.Union[int, float]
+    :param logger: a preconfigured Logger instance
+    :type logger: logging.Logger
+    :param cooldown_duration: number of seconds the telemetry monitor should
+        poll for new jobs before attempting to shutdown
+    :type cooldown_duration: int
+    """
+    elapsed: int = 0
+    last_ts: int = get_ts()
+
+    while observer.is_alive():
+        timestamp = get_ts()
+        logger.debug(f"Telemetry timestep: {timestamp}")
+        action_handler.on_timestep(timestamp)
+
+        elapsed += timestamp - last_ts
+        last_ts = timestamp
+
+        if can_shutdown(action_handler, logger):
+            if elapsed >= cooldown_duration:
+                logger.info("beginning telemetry manager shutdown")
+                observer.stop()  # type: ignore
+        else:
+            # reset cooldown any time there are still jobs running
+            elapsed = 0
+
+        time.sleep(frequency)
+
+
+def main(
+    frequency: t.Union[int, float],
+    experiment_dir: pathlib.Path,
+    logger: logging.Logger,
+    observer: t.Optional[BaseObserver] = None,
+    cooldown_duration: t.Optional[int] = 0,
+) -> int:
+    """Set up the monitoring entities and start the timer-based loop that
+    will poll for telemetry data
+
+    :param frequency: frequency (in seconds) of update loop
+    :type frequency: t.Union[int, float]
+    :param experiment_dir: the experiment directory to monitor for changes
+    :type experiment_dir: pathlib.Path
+    :param logger: a preconfigured Logger instance
+    :type logger: logging.Logger
+    :param observer: (optional) a preconfigured Observer to inject
+    :type observer: t.Optional[BaseObserver]
+    :param cooldown_duration: number of seconds the telemetry monitor should
+        poll for new jobs before attempting to shutdown
+    :type cooldown_duration: t.Optional[int]
+    """
+    manifest_relpath = pathlib.Path(TELMON_SUBDIR) / MANIFEST_FILENAME
+    manifest_path = experiment_dir / manifest_relpath
+    monitor_pattern = str(manifest_relpath)
+
+    logger.info(
+        f"Executing telemetry monitor with frequency: {frequency}s"
+        f", on target directory: {experiment_dir}"
+        f" matching pattern: {monitor_pattern}"
+    )
+
+    cooldown_duration = cooldown_duration or CONFIG.telemetry_cooldown
+    log_handler = LoggingEventHandler(logger)  # type: ignore
+    action_handler = ManifestEventHandler(monitor_pattern, logger)
+
+    if observer is None:
+        observer = Observer()
+
+    try:
+        if manifest_path.exists():
+            # a manifest may not exist depending on startup timing
+            action_handler.process_manifest(str(manifest_path))
+
+        observer.schedule(log_handler, experiment_dir, recursive=True)  # type:ignore
+        observer.schedule(action_handler, experiment_dir, recursive=True)  # type:ignore
+        observer.start()  # type: ignore
+
+        event_loop(observer, action_handler, frequency, logger, cooldown_duration)
+        return os.EX_OK
+    except Exception as ex:
+        logger.error(ex)
+    finally:
+        if observer.is_alive():
+            observer.stop()  # type: ignore
+        observer.join()
+
+    return os.EX_SOFTWARE
+
+
+def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None:
+    """Helper function to ensure clean process termination"""
+    if not signo:
+        logger = logging.getLogger()
+        logger.warning("Received signal with no signo")
+
+
+def register_signal_handlers() -> None:
+    """Register a signal handling function for all termination events"""
+    for sig in SIGNALS:
+        signal.signal(sig, handle_signal)
+
+
+def get_parser() -> argparse.ArgumentParser:
+    """Instantiate a parser to process command line arguments"""
+    arg_parser = argparse.ArgumentParser(description="SmartSim Telemetry Monitor")
+    arg_parser.add_argument(
+        "-frequency",
+        type=int,
+        help="Frequency of telemetry updates (in seconds)",
+        required=True,
+    )
+    arg_parser.add_argument(
+        "-exp_dir",
+        type=str,
+        help="Experiment root directory",
+        required=True,
+    )
+    arg_parser.add_argument(
+        "-cooldown",
+        type=int,
+        help="Default lifetime of telemetry monitor (in seconds) before auto-shutdown",
+        default=CONFIG.telemetry_cooldown,
+    )
+    return arg_parser
+
+
+if __name__ == "__main__":
+    os.environ["PYTHONUNBUFFERED"] = "1"
+
+    parser = get_parser()
+    args = parser.parse_args()
+
+    log = logging.getLogger(f"{__name__}.TelemetryMonitor")
+    log.setLevel(logging.DEBUG)
+    log.propagate = False
+
+    log_path = os.path.join(args.exp_dir, TELMON_SUBDIR, "telemetrymonitor.log")
+    fh = logging.FileHandler(log_path,
"a") + log.addHandler(fh) + + # Must register cleanup before the main loop is running + register_signal_handlers() + + try: + main( + int(args.frequency), + pathlib.Path(args.exp_dir), + log, + cooldown_duration=args.cooldown, + ) + sys.exit(0) + except Exception: + log.exception( + "Shutting down telemetry monitor due to unexpected error", exc_info=True + ) + + sys.exit(1) diff --git a/smartsim/_core/launcher/cobalt/cobaltLauncher.py b/smartsim/_core/launcher/cobalt/cobaltLauncher.py index ca0b88a3b..4c7206969 100644 --- a/smartsim/_core/launcher/cobalt/cobaltLauncher.py +++ b/smartsim/_core/launcher/cobalt/cobaltLauncher.py @@ -117,16 +117,13 @@ def run(self, step: Step) -> t.Optional[str]: # aprun doesn't direct output for us. out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) # if batch submission did not successfully retrieve job ID diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index ec8bb0120..1441fe8b0 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -47,11 +47,6 @@ class Launcher(abc.ABC): # pragma: no cover step_mapping: StepMapping task_manager: TaskManager - @property - @abc.abstractmethod - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: - raise NotImplementedError - @abc.abstractmethod def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: raise NotImplementedError @@ -86,6 +81,11 @@ def __init__(self) -> None: self.task_manager = TaskManager() self.step_mapping = StepMapping() + @property + @abc.abstractmethod + def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + raise NotImplementedError + # every launcher utilizing this interface must have a map # of supported RunSettings types (see slurmLauncher.py for ex) def create_step( @@ -176,6 +176,6 @@ def _get_unmanaged_step_update( # pylint: disable-next=no-self-use def _get_managed_step_update( self, - step_ids: t.List[str], # pylint: disable=unused-argument + step_ids: t.List[str], # pylint: disable=unused-argument ) -> t.List[StepInfo]: # pragma: no cover return [] diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index 7e5c56f7b..3f0f2d8d2 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -24,29 +24,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os
+import sys
 import typing as t
 
 from ..launcher import Launcher
 from ....log import get_logger
 from ....settings import RunSettings, SettingsBase
-from ..step import LocalStep
-from ..step import Step
+from ..step import LocalStep, Step
 from ..stepInfo import UnmanagedStepInfo, StepInfo
 from ..stepMapping import StepMapping
 from ..taskManager import TaskManager
-
-logger = get_logger(__name__)
+from ...utils.helpers import encode_cmd
+from ...config import CONFIG
 
 
 class LocalLauncher(Launcher):
     """Launcher used for spawning processes on the localhost machine."""
 
-    @property
-    def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]:
-        return {
-            RunSettings: LocalStep,
-        }
-
     def __init__(self) -> None:
         self.task_manager = TaskManager()
         self.step_mapping = StepMapping()
@@ -60,16 +55,17 @@ def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step:
             raise TypeError(
                 f"Local Launcher only supports entities with RunSettings, not {type(step_settings)}"
             )
-        step = LocalStep(name, cwd, step_settings)
-        return step
+        return LocalStep(name, cwd, step_settings)
 
-    def get_step_update(self, step_names: t.List[str]) -> t.List[t.Tuple[str, t.Optional[StepInfo]]]:
+    def get_step_update(
+        self, step_names: t.List[str]
+    ) -> t.List[t.Tuple[str, t.Optional[StepInfo]]]:
         """Get status updates of each job step name provided
 
         :param step_names: list of step_names
         :type step_names: list[str]
         :return: list of tuples for update
-        :rtype: list[(str, UnmanagedStepInfo)]
+        :rtype: list[tuple[str, StepInfo | None]]
         """
         # step ids are process ids of the tasks
         # as there is no WLM intermediary
@@ -85,8 +81,12 @@ def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]:
         """Return the address of nodes assigned to the step
 
+        :param step_names: list of step_names
+        :type step_names: list[str]
+        :return: list of node addresses
+        :rtype: list[list[str]]
+
         TODO: Use socket to find the actual Lo address?
-        :return: a list containing the local host address
         """
         return [["127.0.0.1"] * len(step_names)]
 
@@ -104,16 +104,17 @@ def run(self, step: Step) -> str:
             self.task_manager.start()
 
         out, err = step.get_output_files()
-        output = open(out, "w+")
-        error = open(err, "w+")
 
         cmd = step.get_launch_cmd()
-        # LocalStep.run_command omits env, include it here
-        passed_env = step.env if isinstance(step, LocalStep) else None
+        # pylint: disable-next=consider-using-with
+        output = open(out, "w+", encoding="utf-8")
+        # pylint: disable-next=consider-using-with
+        error = open(err, "w+", encoding="utf-8")
 
         task_id = self.task_manager.start_task(
-            cmd, step.cwd, env=passed_env, out=output.fileno(), err=error.fileno()
+            cmd, step.cwd, env=step.env, out=output.fileno(), err=error.fileno()
         )
+
         self.step_mapping.add(step.name, task_id=task_id, managed=False)
         return task_id
 
@@ -127,7 +128,7 @@ def stop(self, step_name: str) -> UnmanagedStepInfo:
         """
         # step_id is task_id for local.
Naming for consistency step_id = self.step_mapping[step_name].task_id - + self.task_manager.remove_task(str(step_id)) _, rc, out, err = self.task_manager.get_task_update(str(step_id)) step_info = UnmanagedStepInfo("Cancelled", rc, out, err) diff --git a/smartsim/_core/launcher/lsf/lsfLauncher.py b/smartsim/_core/launcher/lsf/lsfLauncher.py index a8d0e27aa..13b3be9bb 100644 --- a/smartsim/_core/launcher/lsf/lsfLauncher.py +++ b/smartsim/_core/launcher/lsf/lsfLauncher.py @@ -42,13 +42,13 @@ from ...config import CONFIG from ..launcher import WLMLauncher from ..step import ( - Step, BsubBatchStep, JsrunStep, LocalStep, MpiexecStep, MpirunStep, OrterunStep, + Step, ) from ..stepInfo import LSFBatchStepInfo, LSFJsrunStepInfo, StepInfo from .lsfCommands import bjobs, bkill, jskill, jslist @@ -115,19 +115,16 @@ def run(self, step: Step) -> t.Optional[str]: time.sleep(1) step_id = self._get_lsf_step_id(step) logger.debug(f"Gleaned jsrun step id: {step_id} for {step.name}") - else: # isinstance(step, MpirunStep) or isinstance(step, LocalStep) + else: # mpirun and local launch don't direct output for us out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) self.step_mapping.add(step.name, step_id, task_id, step.managed) diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index cbb85337c..f7d854a7b 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -111,15 +111,12 @@ def run(self, step: Step) -> t.Optional[str]: # aprun/local doesn't direct output for us. 
out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) # if batch submission did not successfully retrieve job ID diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index 70bdab5a2..ae44ddc8e 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -155,15 +155,12 @@ def run(self, step: Step) -> t.Optional[str]: # MPI/local steps don't direct output like slurm steps out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) if not step_id and step.managed: diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index 80e7e7658..6169df083 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -31,7 +31,7 @@ from ....error import AllocationError from ....log import get_logger -from .step import Step +from .step import Step, proxyable_launch_cmd from ....settings import AprunSettings, RunSettings, Singularity logger = get_logger(__name__) @@ -56,9 +56,11 @@ def __init__(self, name: str, cwd: str, run_settings: AprunSettings) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached RunSettings + """ return self.run_settings.mpmd + @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step diff --git a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index d15a48381..709137e5b 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -28,7 +28,7 @@ import shutil import typing as t -from .step import Step +from .step import Step, proxyable_launch_cmd from ....settings.base import RunSettings from ....settings import Singularity @@ -37,8 +37,13 @@ class LocalStep(Step): def __init__(self, name: str, cwd: str, run_settings: RunSettings): super().__init__(name, cwd, run_settings) self.run_settings = run_settings - self.env = self._set_env() + self._env = self._set_env() + @property + def env(self) -> t.Dict[str, str]: + return self._env + + @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: cmd = [] diff --git a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index ae6c3525b..a10827950 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -213,7 +213,8 @@ def _set_alloc(self) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - 
of attached RunSettings""" + of attached RunSettings + """ if isinstance(self.step_settings, JsrunSettings): return self.step_settings.mpmd return [] diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 9a0796c0f..8ab6c0d47 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -26,12 +26,12 @@ import os import shutil -from shlex import split as sh_split import typing as t +from shlex import split as sh_split from ....error import AllocationError, SmartSimError from ....log import get_logger -from .step import Step +from .step import Step, proxyable_launch_cmd from ....settings import MpirunSettings, MpiexecSettings, OrterunSettings from ....settings.base import RunSettings @@ -59,6 +59,7 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings) -> None: _supported_launchers = ["PBS", "COBALT", "SLURM", "LSB"] + @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step @@ -118,7 +119,8 @@ def _set_alloc(self) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached RunSettings + """ if hasattr(self.run_settings, "mpmd") and self.run_settings.mpmd: rs_mpmd: t.List[RunSettings] = self.run_settings.mpmd return rs_mpmd diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 18575e4e9..67353faa7 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -26,13 +26,13 @@ import os import shutil -from shlex import split as sh_split import typing as t +from shlex import split as sh_split from ....error import AllocationError from ....log import get_logger from .step import Step -from ....settings import SrunSettings, SbatchSettings, RunSettings, Singularity +from ....settings import RunSettings, SbatchSettings, Singularity, SrunSettings logger = get_logger(__name__) @@ -189,13 +189,15 @@ def _set_alloc(self) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached RunSettings + """ return self.run_settings.mpmd @staticmethod def _get_exe_args_list(run_setting: RunSettings) -> t.List[str]: """Convenience function to encapsulate checking the - runsettings.exe_args type to always return a list""" + runsettings.exe_args type to always return a list + """ exe_args = run_setting.exe_args args: t.List[str] = exe_args if isinstance(exe_args, list) else [exe_args] return args diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 2aa995768..d77616cc2 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -26,17 +26,20 @@ from __future__ import annotations +import functools import os.path as osp +import sys import time import typing as t - from os import makedirs -from smartsim.error.errors import SmartSimError + +from smartsim.error.errors import SmartSimError, UnproxyableStepError +from smartsim._core.config import CONFIG from ....log import get_logger -from ...utils.helpers import get_base_36_repr +from ...utils.helpers import get_base_36_repr, encode_cmd from ..colocated import write_colocated_launch_script -from ....settings.base import SettingsBase, RunSettings +from ....settings.base import RunSettings, SettingsBase logger = get_logger(__name__) @@ -48,6 +51,12 @@ def __init__(self, name: 
str, cwd: str, step_settings: SettingsBase) -> None:
         self.cwd = cwd
         self.managed = False
         self.step_settings = step_settings
+        self.meta: t.Dict[str, str] = {}
+
+    @property
+    def env(self) -> t.Optional[t.Dict[str, str]]:
+        """Overridable, read-only property for a step to specify its environment"""
+        return None
 
     def get_launch_cmd(self) -> t.List[str]:
         raise NotImplementedError
@@ -68,7 +77,8 @@ def get_step_file(
     ) -> str:
         """Get the name for a file/script created by the step class
 
-        Used for Batch scripts, mpmd scripts, etc"""
+        Used for Batch scripts, mpmd scripts, etc.
+        """
         if script_name:
             script_name = script_name if "." in script_name else script_name + ending
         return osp.join(self.cwd, script_name)
@@ -107,3 +117,49 @@ def add_to_batch(self, step: Step) -> None:
         :type step: Step
         """
         raise SmartSimError("add_to_batch not implemented for this step type")
+
+
+_StepT = t.TypeVar("_StepT", bound=Step)
+
+
+def proxyable_launch_cmd(
+    fn: t.Callable[[_StepT], t.List[str]], /
+) -> t.Callable[[_StepT], t.List[str]]:
+    @functools.wraps(fn)
+    def _get_launch_cmd(self: _StepT) -> t.List[str]:
+        original_cmd_list = fn(self)
+
+        if not CONFIG.telemetry_enabled:
+            return original_cmd_list
+
+        if self.managed:
+            raise UnproxyableStepError(
+                f"Attempting to proxy managed step of type {type(self)} "
+                "through the unmanaged step proxy entry point"
+            )
+
+        proxy_module = "smartsim._core.entrypoints.indirect"
+        etype = self.meta["entity_type"]
+        status_dir = self.meta["status_dir"]
+        encoded_cmd = encode_cmd(original_cmd_list)
+
+        # NOTE: this is NOT safe. should either 1) sign cmd and verify OR 2)
+        #       serialize step and let the indirect entrypoint rebuild the
+        #       cmd... for now, test away...
+        return [
+            sys.executable,
+            "-m",
+            proxy_module,
+            "+name",
+            self.name,
+            "+command",
+            encoded_cmd,
+            "+entity_type",
+            etype,
+            "+telemetry_dir",
+            status_dir,
+            "+working_dir",
+            self.cwd,
+        ]
+
+    return _get_launch_cmd
diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py
index 5d6b6d769..fea0269f0 100644
--- a/smartsim/_core/utils/helpers.py
+++ b/smartsim/_core/utils/helpers.py
@@ -27,9 +27,11 @@
 """
 A file of helper functions for SmartSim
 """
+import base64
 import os
 import uuid
 import typing as t
+from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
 from shutil import which
@@ -64,21 +66,20 @@ def unpack_colo_db_identifier(db_id: str) -> str:
     return "_" + db_id if db_id else ""
 
 
+def create_short_id_str() -> str:
+    return str(uuid.uuid4())[:7]
+
+
 def create_lockfile_name() -> str:
     """Generate a unique lock filename using UUID"""
-    lock_suffix = str(uuid.uuid4())[:7]
+    lock_suffix = create_short_id_str()
     return f"smartsim-{lock_suffix}.lock"
 
 
 @lru_cache(maxsize=20, typed=False)
 def check_dev_log_level() -> bool:
-    try:
-        lvl = os.environ["SMARTSIM_LOG_LEVEL"]
-        if lvl == "developer":
-            return True
-        return False
-    except KeyError:
-        return False
+    lvl = os.environ.get("SMARTSIM_LOG_LEVEL", "")
+    return lvl == "developer"
 
 
 def fmt_dict(value: t.Dict[str, t.Any]) -> str:
@@ -273,3 +274,31 @@ def installed_redisai_backends(
     }
 
     return {backend for backend in backends if _installed(base_path, backend)}
+
+
+def get_ts() -> int:
+    """Return the current timestamp (accurate to seconds) cast to an integer"""
+    return int(datetime.timestamp(datetime.now()))
+
+
+def encode_cmd(cmd: t.List[str]) -> str:
+    """Transform a standard command list into an encoded string safe for providing as an
+    argument to a proxy entrypoint
+    """
+    if not cmd:
+        raise
ValueError("Invalid cmd supplied") + + ascii_cmd = "|".join(cmd).encode("ascii") + encoded_cmd = base64.b64encode(ascii_cmd).decode("ascii") + return encoded_cmd + + +def decode_cmd(encoded_cmd: str) -> t.List[str]: + """Decode an encoded command string to the original command list format""" + if not encoded_cmd.strip(): + raise ValueError("Invalid cmd supplied") + + decoded_cmd = base64.b64decode(encoded_cmd.encode("ascii")) + cleaned_cmd = decoded_cmd.decode("ascii").split("|") + + return cleaned_cmd diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py new file mode 100644 index 000000000..5547a49f8 --- /dev/null +++ b/smartsim/_core/utils/serialize.py @@ -0,0 +1,246 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
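A quick round-trip sketch of the `encode_cmd`/`decode_cmd` helpers above. Note that the scheme joins argv entries with "|" before base64-encoding, so entries containing "|" (or non-ASCII bytes) are not representable:

from smartsim._core.utils.helpers import decode_cmd, encode_cmd

cmd = ["echo", "hello", "world"]
encoded = encode_cmd(cmd)          # opaque ASCII token, safe as a single CLI argument
assert decode_cmd(encoded) == cmd  # the original argv list is restored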
+ +from __future__ import annotations + +import json +import time +import typing as t +from pathlib import Path + +import smartsim.log +import smartsim._core._cli.utils as _utils +from smartsim._core.config import CONFIG + +if t.TYPE_CHECKING: + from smartsim import Experiment + from smartsim._core.control.manifest import LaunchedManifest as _Manifest + from smartsim.database.orchestrator import Orchestrator + from smartsim.entity import DBNode, Ensemble, Model + from smartsim.entity.dbobject import DBModel, DBScript + from smartsim.settings.base import BatchSettings, RunSettings + + +TStepLaunchMetaData = t.Tuple[ + t.Optional[str], t.Optional[str], t.Optional[bool], str, str, Path +] +TELMON_SUBDIR: t.Final[str] = ".smartsim/telemetry" +MANIFEST_FILENAME: t.Final[str] = "manifest.json" + +_LOGGER = smartsim.log.get_logger(__name__) + + +def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: + if not CONFIG.telemetry_enabled: + return + + manifest.metadata.run_telemetry_subdirectory.mkdir(parents=True, exist_ok=True) + + new_run = { + "run_id": manifest.metadata.run_id, + "timestamp": int(time.time_ns()), + "model": [ + _dictify_model(model, *telemetry_metadata) + for model, telemetry_metadata in manifest.models + ], + "orchestrator": [ + _dictify_db(db, nodes_info) for db, nodes_info in manifest.databases + ], + "ensemble": [ + _dictify_ensemble(ens, member_info) + for ens, member_info in manifest.ensembles + ], + } + try: + with open(manifest.metadata.manifest_file_path, "r", encoding="utf-8") as file: + manifest_dict = json.load(file) + except (FileNotFoundError, json.JSONDecodeError): + manifest_dict = { + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1", + }, + "experiment": { + "name": manifest.metadata.exp_name, + "path": manifest.metadata.exp_path, + "launcher": manifest.metadata.launcher_name, + }, + "runs": [new_run], + } + else: + manifest_dict["runs"].append(new_run) + finally: + with open(manifest.metadata.manifest_file_path, "w", encoding="utf-8") as file: + json.dump(manifest_dict, file, indent=2) + + +def _dictify_model( + model: Model, + step_id: t.Optional[str], + task_id: t.Optional[str], + managed: t.Optional[bool], + out_file: str, + err_file: str, + telemetry_data_path: Path, +) -> t.Dict[str, t.Any]: + colo_settings = (model.run_settings.colocated_db_settings or {}).copy() + db_scripts = t.cast("t.List[DBScript]", colo_settings.pop("db_scripts", [])) + db_models = t.cast("t.List[DBModel]", colo_settings.pop("db_models", [])) + return { + "name": model.name, + "path": model.path, + "exe_args": model.run_settings.exe_args, + "run_settings": _dictify_run_settings(model.run_settings), + "batch_settings": _dictify_batch_settings(model.batch_settings) + if model.batch_settings + else {}, + "params": model.params, + "files": { + "Symlink": model.files.link, + "Configure": model.files.tagged, + "Copy": model.files.copy, + } + if model.files + else { + "Symlink": [], + "Configure": [], + "Copy": [], + }, + "colocated_db": { + "settings": colo_settings, + "scripts": [ + { + script.name: { + "backend": "TORCH", + "device": script.device, + } + } + for script in db_scripts + ], + "models": [ + { + model.name: { + "backend": model.backend, + "device": model.device, + } + } + for model in db_models + ], + } + if colo_settings + else {}, + "telemetry_metadata": { + "status_dir": str(telemetry_data_path), + "step_id": step_id, + "task_id": task_id, + "managed": managed, + }, + "out_file": out_file, + "err_file": err_file, + } + + +def 
_dictify_ensemble( + ens: Ensemble, + members: t.Sequence[t.Tuple[Model, TStepLaunchMetaData]], +) -> t.Dict[str, t.Any]: + return { + "name": ens.name, + "params": ens.params, + "batch_settings": _dictify_batch_settings(ens.batch_settings) + # FIXME: Typehint here is wrong, ``ens.batch_settings`` can + # also be an empty dict for no discernible reason... + if ens.batch_settings else {}, + "models": [ + _dictify_model(model, *launching_metadata) + for model, launching_metadata in members + ], + } + + +def _dictify_run_settings(run_settings: RunSettings) -> t.Dict[str, t.Any]: + # TODO: remove this downcast + if hasattr(run_settings, "mpmd") and run_settings.mpmd: + _LOGGER.warning( + "SmartSim currently cannot properly serialize all information in " + "MPMD run settings" + ) + return { + "exe": run_settings.exe, + # TODO: We should try to move this back + # "exe_args": run_settings.exe_args, + "run_command": run_settings.run_command, + "run_args": run_settings.run_args, + # TODO: We currently do not have a way to represent MPMD commands! + # Maybe add a ``"mpmd"`` key here that is a + # ``list[TDictifiedRunSettings]``? + } + + +def _dictify_batch_settings(batch_settings: BatchSettings) -> t.Dict[str, t.Any]: + return { + "batch_command": batch_settings.batch_cmd, + "batch_args": batch_settings.batch_args, + } + + +def _dictify_db( + db: Orchestrator, + nodes: t.Sequence[t.Tuple[DBNode, TStepLaunchMetaData]], +) -> t.Dict[str, t.Any]: + db_path = _utils.get_db_path() + if db_path: + db_type, _ = db_path.name.split("-", 1) + else: + db_type = "Unknown" + return { + "name": db.name, + "type": db_type, + "interface": db._interfaces, # pylint: disable=protected-access + "shards": [ + { + **shard.to_dict(), + "conf_file": shard.cluster_conf_file, + "out_file": out_file, + "err_file": err_file, + "telemetry_metadata": { + "status_dir": str(status_dir), + "step_id": step_id, + "task_id": task_id, + "managed": managed, + }, + } + for dbnode, ( + step_id, + task_id, + managed, + out_file, + err_file, + status_dir, + ) in nodes + for shard in dbnode.get_launched_shard_info() + ], + } diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 35445c42d..ba9a50c80 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -76,11 +76,12 @@ def __init__( @property def num_shards(self) -> int: - try: - return len(self.run_settings.mpmd) + 1 # type: ignore[attr-defined] - except AttributeError: + if not hasattr(self.run_settings, "mpmd"): + # return default number of shards if mpmd is not set return 1 + return len(self.run_settings.mpmd) + 1 + @property def host(self) -> str: try: @@ -99,11 +100,12 @@ def hosts(self) -> t.List[str]: @property def is_mpmd(self) -> bool: - try: - return bool(self.run_settings.mpmd) # type: ignore[attr-defined] - except AttributeError: + if not hasattr(self.run_settings, "mpmd"): + # missing mpmd property guarantees this is not an mpmd run return False + return bool(self.run_settings.mpmd) + def set_hosts(self, hosts: t.List[str]) -> None: self._hosts = [str(host) for host in hosts] diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index ffa1cfb17..ad67ae88b 100644 --- a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -39,12 +39,14 @@ class SSUnsupportedError(Exception): class EntityExistsError(SmartSimError): """Raised when a user tries to create an entity or files/directories for - an entity and either the entity/files/directories already exist""" + an entity and either the entity/files/directories already exist + """ 
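For orientation, the manifest written by `save_launch_manifest` above takes roughly the following shape; every value below is an illustrative placeholder rather than output captured from this patch:

# abridged sketch of <exp_dir>/.smartsim/telemetry/manifest.json
manifest = {
    "schema info": {"schema_name": "entity manifest", "version": "0.0.1"},
    "experiment": {"name": "my-exp", "path": "/path/to/exp", "launcher": "local"},
    "runs": [
        {
            "run_id": "d041b90",               # placeholder run id
            "timestamp": 1701890000000000000,  # from time.time_ns()
            "model": [],                       # _dictify_model(...) entries
            "orchestrator": [],                # _dictify_db(...) entries
            "ensemble": [],                    # _dictify_ensemble(...) entries
        }
    ],
}

Each new `Experiment.start` call appends another entry to `runs`, which is why the telemetry monitor's runtime manifest can hold multiple experiment executions.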
class UserStrategyError(SmartSimError): """Raised when there is an error with model creation inside an ensemble - that is from a user provided permutation strategy""" + that is from a user provided permutation strategy + """ def __init__(self, perm_strat: str) -> None: message = self.create_message(perm_strat) @@ -80,16 +82,15 @@ class SSReservedKeywordError(SmartSimError): class SSDBIDConflictError(SmartSimError): """Raised in the event that a database identifier - is not unique when multiple databases are created""" + is not unique when multiple databases are created + """ # Internal Exceptions class SSInternalError(Exception): - """ - SSInternalError is raised when an internal error is encountered. - """ + """SSInternalError is raised when an internal error is encountered""" class SSConfigError(SSInternalError): @@ -106,7 +107,8 @@ class AllocationError(LauncherError): class ShellError(LauncherError): """Raised when error arises from function within launcher.shell - Closely related to error from subprocess(Popen) commands""" + Closely related to error from subprocess(Popen) commands + """ def __init__( self, @@ -130,3 +132,17 @@ def create_message( if details: msg += f"\nError from shell: {details}" return msg + + +class TelemetryError(SSInternalError): + """Raised when SmartSim runs into trouble establishing or communicating + telemetry information + """ + +class UnproxyableStepError(TelemetryError): + """Raised when a user attempts to proxy a managed ``Step`` through the + unmanaged step proxy entry point + """ + +class SmartSimCLIActionCancelled(SmartSimError): + """Raised when a `smart` CLI command is terminated""" diff --git a/smartsim/experiment.py b/smartsim/experiment.py index a9d275088..b3ef2fd09 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os import os.path as osp import typing as t from os import getcwd @@ -194,6 +195,8 @@ def start( if summary: self._launch_summary(start_manifest) self._control.start( + exp_name=self.name, + exp_path=self.exp_path, manifest=start_manifest, block=block, kill_on_interrupt=kill_on_interrupt, @@ -878,3 +881,35 @@ def append_to_db_identifier_list(self, db_identifier: str) -> None: ) # Otherwise, add self.db_identifiers.add(db_identifier) + + def enable_telemetry(self) -> None: + """Experiments will start producing telemetry for all entities run + through ``Experiment.start`` + + .. warning:: + + This method is currently implemented so that ALL ``Experiment`` + instances will begin producing telemetry data. In the future it + is planned to have this method work on a "per instance" basis! + """ + self._set_telemetry(True) + + def disable_telemetry(self) -> None: + """Experiments will stop producing telemetry for all entities run + through ``Experiment.start`` + + .. warning:: + + This method is currently implemented so that ALL ``Experiment`` + instances will stop producing telemetry data. In the future it + is planned to have this method work on a "per instance" basis! 
+        """
+        self._set_telemetry(False)
+
+    @staticmethod
+    def _set_telemetry(switch: bool, /) -> None:
+        tm_key = "SMARTSIM_FLAG_TELEMETRY"
+        if switch:
+            os.environ[tm_key] = "1"
+        else:
+            os.environ[tm_key] = "0"
diff --git a/smartsim/log.py b/smartsim/log.py
index 9011b3d1b..72d5ad817 100644
--- a/smartsim/log.py
+++ b/smartsim/log.py
@@ -30,12 +30,16 @@
 
 import coloredlogs
 
-# constants for logging
-coloredlogs.DEFAULT_DATE_FORMAT = "%H:%M:%S"
-coloredlogs.DEFAULT_LOG_FORMAT = (
+# constants
+DEFAULT_DATE_FORMAT: t.Final[str] = "%H:%M:%S"
+DEFAULT_LOG_FORMAT: t.Final[str] = (
     "%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s"
 )
+# configure colored logs
+coloredlogs.DEFAULT_DATE_FORMAT = DEFAULT_DATE_FORMAT
+coloredlogs.DEFAULT_LOG_FORMAT = DEFAULT_LOG_FORMAT
+
 
 def _get_log_level() -> str:
     """Get the logging level based on environment variable
diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py
index 8fe12b3f9..ba46fb64c 100644
--- a/smartsim/wlm/slurm.py
+++ b/smartsim/wlm/slurm.py
@@ -237,7 +237,8 @@ def _get_alloc_cmd(
     options: t.Optional[t.Dict[str, str]] = None,
 ) -> t.List[str]:
     """Return the command to request an allocation from Slurm with
-    the class variables as the slurm options."""
+    the class variables as the slurm options.
+    """
 
     salloc_args = [
         "--no-shell",
diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py
index 386631a50..7472db706 100644
--- a/tests/backends/test_dbmodel.py
+++ b/tests/backends/test_dbmodel.py
@@ -398,7 +398,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils):
     test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")
 
     # Create SmartSim Experiment
-    exp = Experiment(exp_name, launcher=test_launcher)
+    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
 
     # Create RunSettings
     colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script)
@@ -469,7 +469,7 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils):
     test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py")
 
     # Create the SmartSim Experiment
-    exp = Experiment(exp_name, launcher=test_launcher)
+    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
 
     # Create colocated RunSettings
     colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script)
@@ -633,7 +633,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils):
     test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")
 
     # Create the SmartSim Experiment
-    exp = Experiment(exp_name, launcher=test_launcher)
+    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
 
     # Create colocated RunSettings
     colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script)
@@ -735,7 +735,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils):
     test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")
 
     # Create SmartSim Experiment
-    exp = Experiment(exp_name, launcher=test_launcher)
+    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
 
     # Create colocated RunSettings
     colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script)
@@ -818,6 +818,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils):
     with pytest.raises(SSUnsupportedError):
         colo_ensemble.add_model(colo_model)
 
+
 @pytest.mark.skipif(not should_run_tf, reason="Test needs TensorFlow to run")
 def test_inconsistent_params_db_model():
     """Test error when devices_per_node parameter > 1 while devices is set to CPU in DBModel"""
diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py
index c92be31de..4d1743402 100644
--- a/tests/backends/test_dbscript.py
+++ b/tests/backends/test_dbscript.py
@@ -245,7 +245,7 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils):
     torch_script = fileutils.get_test_conf_path("torchscript.py")
 
     # Create the SmartSim Experiment
-    exp = Experiment(exp_name, launcher=test_launcher)
+    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
 
     # Create RunSettings
     colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script)
@@ -313,7 +313,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils):
     torch_script = fileutils.get_test_conf_path("torchscript.py")
 
     # Create SmartSim Experiment
-    exp = Experiment(exp_name, launcher=test_launcher)
+    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
 
     # Create RunSettings
     colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script)
@@ -412,7 +412,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils):
     torch_script = fileutils.get_test_conf_path("torchscript.py")
 
     # Create SmartSim Experiment
-    exp = Experiment(exp_name, launcher=test_launcher)
+    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
 
     # Create RunSettings
     colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script)
@@ -509,7 +509,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils):
     torch_script = fileutils.get_test_conf_path("torchscript.py")
 
     # Create SmartSim experiment
-    exp = Experiment(exp_name, launcher=test_launcher)
+    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
 
     # Create RunSettings
     colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script)
diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py
index 4beccd41b..e3d07118d 100644
--- a/tests/full_wlm/test_generic_batch_launch.py
+++ b/tests/full_wlm/test_generic_batch_launch.py
@@ -39,8 +39,10 @@ def test_batch_model(fileutils, wlmutils):
     """Test the launch of a manually constructed batch model"""
 
     exp_name = "test-batch-model"
-    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
     test_dir = fileutils.make_test_dir()
+    exp = Experiment(
+        exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir
+    )
 
     script = fileutils.get_test_conf_path("sleep.py")
     batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00")
@@ -64,8 +66,10 @@ def test_batch_ensemble(fileutils, wlmutils):
     """Test the launch of a manually constructed batch ensemble"""
 
     exp_name = "test-batch-ensemble"
-    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
     test_dir = fileutils.make_test_dir()
+    exp = Experiment(
+        exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir
+    )
 
     script = fileutils.get_test_conf_path("sleep.py")
     settings = wlmutils.get_run_settings("python", f"{script} --time=5")
@@ -89,8 +93,10 @@ def test_batch_ensemble_replicas(fileutils, wlmutils):
     exp_name = "test-batch-ensemble-replicas"
-    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
     test_dir = fileutils.make_test_dir()
+    exp = Experiment(
+        exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir
+    )
 
     script = fileutils.get_test_conf_path("sleep.py")
     settings = wlmutils.get_run_settings("python", f"{script} --time=5")
diff --git 
a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 7e5591a30..ab4a3dc59 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -41,8 +41,8 @@ def test_launch_orc_auto_batch(fileutils, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-orc-batch" - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -77,8 +77,8 @@ def test_launch_cluster_orc_batch_single(fileutils, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-batch-single" - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -116,8 +116,8 @@ def test_launch_cluster_orc_batch_multi(fileutils, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-batch-multi" - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -153,8 +153,8 @@ def test_launch_cluster_orc_reconnect(fileutils, wlmutils): """test reconnecting to clustered 3-node orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = "test-launch-cluster-orc-batch-reconect" - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 19f4660c2..0ec9fb2c7 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -61,7 +61,8 @@ def test_mpmd(fileutils, wlmutils): "cobalt": ["mpirun"], } - exp = Experiment(exp_name, launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) def prune_commands(launcher): available_commands = [] @@ -77,7 +78,6 @@ def prune_commands(launcher): f"MPMD on {launcher} only supported for run commands {by_launcher[launcher]}" ) - test_dir = fileutils.make_test_dir() for run_command in run_commands: script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings( diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py index 3aa77983f..f555336ec 100644 --- a/tests/on_wlm/test_base_settings_on_wlm.py +++ b/tests/on_wlm/test_base_settings_on_wlm.py @@ -42,8 +42,10 @@ def test_model_on_wlm(fileutils, wlmutils): exp_name = "test-base-settings-model-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") @@ -60,8 +62,10 @@ def test_model_on_wlm(fileutils, wlmutils): def test_model_stop_on_wlm(fileutils, wlmutils): exp_name = "test-base-settings-model-stop" - exp = 
Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index a38fabd06..92db78a11 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -47,7 +47,8 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type): db_args = { } - exp = Experiment("colocated_model_defaults", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment("colocated_model_defaults", launcher=launcher, exp_path=test_dir) colo_model = coloutils.setup_test_colo( fileutils, db_type, @@ -69,7 +70,12 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type): @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_disable_pinning(fileutils, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment( + "colocated_model_pinning_auto_1cpu", + launcher=launcher, + exp_path=test_dir + ) db_args = { "db_cpus": 1, "custom_pinning": [], @@ -91,7 +97,12 @@ def test_colocated_model_disable_pinning(fileutils, coloutils, db_type): @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment( + "colocated_model_pinning_auto_2cpu", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 2, @@ -115,7 +126,12 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # Assume that there are at least 4 cpus on the node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment( + "colocated_model_pinning_manual", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 4, @@ -139,7 +155,12 @@ def test_colocated_model_pinning_list(fileutils, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has more than 2 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment( + "colocated_model_pinning_manual", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 2, @@ -163,7 +184,12 @@ def test_colocated_model_pinning_mixed(fileutils, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this at least 4 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment( + "colocated_model_pinning_manual", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 2, diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index 919317c73..7d8143789 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -38,8 +38,8 @@ def test_launch_orc_auto(fileutils, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = 
"test-launch-auto-orc" - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -71,8 +71,8 @@ def test_launch_cluster_orc_single(fileutils, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-single" - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -105,8 +105,8 @@ def test_launch_cluster_orc_multi(fileutils, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-multi" - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py index 77ba8a69a..1b8aeb1f1 100644 --- a/tests/on_wlm/test_launch_errors.py +++ b/tests/on_wlm/test_launch_errors.py @@ -40,8 +40,10 @@ def test_failed_status(fileutils, wlmutils): """Test when a failure occurs deep into model execution""" exp_name = "test-report-failure" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, + launcher=wlmutils.get_test_launcher(), + exp_path=test_dir) script = fileutils.get_test_conf_path("bad.py") settings = exp.create_run_settings( @@ -69,8 +71,8 @@ def test_bad_run_command_args(fileutils, wlmutils): pytest.skip(f"Only fails with slurm. 
Launcher is {launcher}") exp_name = "test-bad-run-command-args" - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) script = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_launch_ompi_lsf.py b/tests/on_wlm/test_launch_ompi_lsf.py index e3327514a..144b699ca 100644 --- a/tests/on_wlm/test_launch_ompi_lsf.py +++ b/tests/on_wlm/test_launch_ompi_lsf.py @@ -39,8 +39,8 @@ def test_launch_openmpi_lsf(wlmutils, fileutils): if launcher != "lsf": pytest.skip("Test only runs on systems with LSF as WLM") exp_name = "test-launch-openmpi-lsf" - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", script, "mpirun") diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py index 86d883358..baed9c97b 100644 --- a/tests/on_wlm/test_restart.py +++ b/tests/on_wlm/test_restart.py @@ -38,8 +38,10 @@ def test_restart(fileutils, wlmutils): exp_name = "test-restart" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, + launcher=wlmutils.get_test_launcher(), + exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index d46a46aae..6a8e3d24f 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -56,8 +56,10 @@ def test_simple_model_on_wlm(fileutils, wlmutils): ) exp_name = "test-simplebase-settings-model-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") @@ -77,8 +79,10 @@ def test_simple_model_stop_on_wlm(fileutils, wlmutils): ) exp_name = "test-simplebase-settings-model-stop" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index 16cfa8f38..f909325cb 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -48,8 +48,10 @@ def test_models(fileutils, wlmutils): exp_name = "test-models-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -65,8 +67,10 @@ def test_models(fileutils, wlmutils): def test_ensemble(fileutils, wlmutils): exp_name = "test-ensemble-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment( + exp_name, 
launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -84,8 +88,10 @@ def test_summary(fileutils, wlmutils): """Fairly rudimentary test of the summary dataframe""" exp_name = "test-launch-summary" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) sleep = fileutils.get_test_conf_path("sleep.py") bad = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index a786ce1a4..7f0255f01 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -44,8 +44,10 @@ def test_stop_entity(fileutils, wlmutils): exp_name = "test-launch-stop-model" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") @@ -62,8 +64,10 @@ def test_stop_entity(fileutils, wlmutils): def test_stop_entity_list(fileutils, wlmutils): exp_name = "test-launch-stop-ensemble" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) test_dir = fileutils.make_test_dir() + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") diff --git a/tests/test_cli.py b/tests/test_cli.py index 79471a355..31fce4cd0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -26,12 +26,15 @@ import argparse from contextlib import contextmanager +import logging +import os +import pathlib import typing as t import pytest import smartsim -from smartsim._core._cli import build, cli +from smartsim._core._cli import build, cli, plugin from smartsim._core._cli.build import configure_parser as build_parser from smartsim._core._cli.build import execute as build_execute from smartsim._core._cli.clean import configure_parser as clean_parser @@ -48,6 +51,14 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a +_TEST_LOGGER = logging.getLogger(__name__) + +try: + import smartdashboard +except ImportError: + test_dash_plugin = False +else: + test_dash_plugin = True def mock_execute_custom(msg: str = None, good: bool = True) -> int: retval = 0 if good else 1 return retval -def mock_execute_good(_ns: argparse.Namespace) -> int: +def mock_execute_good(_ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None) -> int: return mock_execute_custom("GOOD THINGS", good = True) -def mock_execute_fail(_ns: argparse.Namespace) -> int: +def mock_execute_fail(_ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None) -> int: return mock_execute_custom("BAD THINGS", good = False) @@ -220,8 +231,8 @@ def test_cli_command_execution(capsys): exp_b_help = "this is my mock help text for build" exp_b_cmd = "build" - dbcli_exec = lambda x: mock_execute_custom(msg="Database", good=True) - build_exec = lambda x: mock_execute_custom(msg="Builder", good=True) + dbcli_exec = lambda x, y: mock_execute_custom(msg="Database", good=True) + build_exec = lambda x, y: mock_execute_custom(msg="Builder", good=True) menu 
= [cli.MenuItemConfig(exp_a_cmd, exp_a_help, @@ -269,7 +280,7 @@ def test_cli_default_cli(capsys): # show that `smart dbcli` calls the build parser and build execute function assert "usage: smart [-h] " in captured.out assert "Available commands" in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `build` argument, expect build-specific help text with pytest.raises(SystemExit) as e: @@ -281,7 +292,7 @@ def test_cli_default_cli(capsys): assert "usage: smart build [-h]" in captured.out assert "Build SmartSim dependencies" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `clean` argument, expect clean-specific help text with pytest.raises(SystemExit) as e: @@ -294,7 +305,7 @@ def test_cli_default_cli(capsys): assert "Remove previous ML runtime installation" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out assert "--clobber" in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `dbcli` argument, expect dbcli-specific help text with pytest.raises(SystemExit) as e: @@ -306,7 +317,7 @@ def test_cli_default_cli(capsys): assert "usage: smart dbcli [-h]" in captured.out assert "Print the path to the redis-cli binary" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `site` argument, expect site-specific help text with pytest.raises(SystemExit) as e: @@ -318,7 +329,7 @@ def test_cli_default_cli(capsys): assert "usage: smart site [-h]" in captured.out assert "Print the installation site of SmartSim" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `clobber` argument, expect clobber-specific help text with pytest.raises(SystemExit) as e: @@ -331,8 +342,61 @@ def test_cli_default_cli(capsys): assert "Remove all previous dependency installations" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out # assert "--clobber" not in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE + + +@pytest.mark.skipif(not test_dash_plugin, reason="plugin not found") +def test_cli_plugin_dashboard(capfd): + """Ensure expected dashboard CLI plugin commands are supported""" + smart_cli = cli.default_cli() + capfd.readouterr() # throw away existing output + + # execute with `dashboard` argument, expect dashboard-specific help text + build_args = ["smart", "dashboard", "-h"] + rc = smart_cli.execute(build_args) + + captured = capfd.readouterr() # capture new output + + assert "[-d DIRECTORY]" in captured.out + assert "[-p PORT]" in captured.out + + assert "optional arguments:" in captured.out + assert rc == 0 + + +def test_cli_plugin_invalid( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +): + """Ensure unexpected CLI plugins are reported""" + import smartsim._core._cli.cli + import smartsim._core._cli.plugin + plugin_module = "notinstalled.Experiment_Overview" + bad_plugins = [ + lambda: MenuItemConfig( + "dashboard", + "Start the SmartSim dashboard", + plugin.dynamic_execute(plugin_module, "Dashboard!"), + is_plugin=True, + ) + ] + monkeypatch.setattr(smartsim._core._cli.cli, "plugins", bad_plugins) + # Coloredlogs doesn't play nice with caplog + monkeypatch.setattr( + smartsim._core._cli.plugin, + "_LOGGER", + 
_TEST_LOGGER, + ) + + smart_cli = cli.default_cli() + # execute with `dashboard` argument, expect failure to find dashboard plugin + build_args = ["smart", "dashboard", "-h"] + + rc = smart_cli.execute(build_args) + + assert plugin_module in caplog.text + assert "not found" in caplog.text + assert rc == os.EX_CONFIG @pytest.mark.parametrize( "command,mock_location,exp_output", @@ -348,7 +412,7 @@ def test_cli_default_cli(capsys): ) def test_cli_action(capsys, monkeypatch, command, mock_location, exp_output): """Ensure the default CLI executes the build action""" - def mock_execute(ns: argparse.Namespace): + def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None): print(exp_output) return 0 @@ -400,7 +464,7 @@ def test_cli_optional_args(capsys, check_prop: str, exp_prop_val: t.Any): """Ensure the parser for a command handles expected optional arguments""" - def mock_execute(ns: argparse.Namespace): + def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None): print(exp_output) return 0 @@ -418,9 +482,6 @@ def mock_execute(ns: argparse.Namespace): assert exp_output in captured.out # did the expected execution method occur? assert ret_val == 0 # is the retval is non-failure code? - - # is the value from the optional argument set in the parsed args? - assert smart_cli.args.__dict__[check_prop] == exp_prop_val else: with pytest.raises(SystemExit) as e: ret_val = smart_cli.execute(build_args) @@ -449,7 +510,7 @@ def test_cli_help_support(capsys, mock_output: str, exp_output: str): """Ensure the parser supports help optional for commands as expected""" - def mock_execute(ns: argparse.Namespace): + def mock_execute(ns: argparse.Namespace, unparsed: t.Optional[t.List[str]] = None): print(mock_output) return 0 @@ -487,7 +548,7 @@ def test_cli_invalid_optional_args(capsys, mock_location: str, exp_output: str): """Ensure the parser throws expected error for an invalid argument""" - def mock_execute(ns: argparse.Namespace): + def mock_execute(ns: argparse.Namespace, unparsed: t.Optional[t.List[str]] = None): print(exp_output) return 0 @@ -540,12 +601,12 @@ def test_cli_full_clean_execute(capsys, monkeypatch): exp_retval = 0 exp_output = "mocked-clean utility" - def mock_operation(*args, **kwargs) -> int: + # mock out the internal clean method so we don't actually delete anything + def mock_clean(core_path: pathlib.Path, _all: bool = False) -> int: print(exp_output) return exp_retval - - # mock out the internal clean method so we don't actually delete anything - monkeypatch.setattr(smartsim._core._cli.clean, "clean", mock_operation) + + monkeypatch.setattr(smartsim._core._cli.clean, "clean", mock_clean) command = "clean" cfg = MenuItemConfig(command, @@ -692,7 +753,7 @@ def mock_operation(*args, **kwargs) -> int: def _good_build(*args, **kwargs): - print("LGTM") + _TEST_LOGGER.info("LGTM") def _bad_build(*args, **kwargs): @@ -707,17 +768,17 @@ def _mock_temp_dir(*a, **kw): @pytest.mark.parametrize( "mock_verify_fn, expected_stdout, expected_retval", [ - pytest.param(_good_build, 'LGTM', 0, id="Configured Correctly"), + pytest.param(_good_build, 'LGTM', os.EX_OK, id="Configured Correctly"), pytest.param( _bad_build, "SmartSim failed to run a simple experiment", - 2, + os.EX_SOFTWARE, id="Configured Incorrectly", ) ], ) -def test_cli_build_test_execute( - capsys, +def test_cli_validation_test_execute( + caplog, monkeypatch, mock_verify_fn, expected_stdout, @@ -728,6 +789,7 @@ def test_cli_build_test_execute( checks that if at any point the test raises 
an exception an appropriate error code and error msg are returned. """ + caplog.set_level(logging.INFO) # Mock out the verification tests/avoid file system ops monkeypatch.setattr(smartsim._core._cli.validate, "test_install", mock_verify_fn) @@ -736,11 +798,11 @@ def test_cli_build_test_execute( "_VerificationTempDir", _mock_temp_dir, ) - # Coloredlogs doesn't play nice with capsys + # Coloredlogs doesn't play nice with caplog monkeypatch.setattr( - smartsim._core._cli.validate.logger, - "error", - print, + smartsim._core._cli.validate, + "logger", + _TEST_LOGGER, ) command = "validate" @@ -751,12 +813,8 @@ def test_cli_build_test_execute( menu = [cfg] smart_cli = cli.SmartCli(menu) - captured = capsys.readouterr() # throw away existing output - verify_args = ["smart", command] actual_retval = smart_cli.execute(verify_args) - captured = capsys.readouterr() # capture new output - - assert expected_stdout in captured.out + assert expected_stdout in caplog.text assert actual_retval == expected_retval diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index df5f65350..8cd8a575a 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -49,7 +49,8 @@ def test_macosx_warning(fileutils, coloutils): db_args = {"custom_pinning": [1]} db_type = "uds" # Test is insensitive to choice of db - exp = Experiment("colocated_model_defaults", launcher="local") + test_dir = fileutils.make_test_dir() + exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.warns( RuntimeWarning, match="CPU pinning is not supported on MacOSX. Ignoring pinning specification.", @@ -67,7 +68,8 @@ def test_unsupported_limit_app(fileutils, coloutils): db_args = {"limit_app_cpus": True} db_type = "uds" # Test is insensitive to choice of db - exp = Experiment("colocated_model_defaults", launcher="local") + test_dir = fileutils.make_test_dir() + exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.raises(SSUnsupportedError): coloutils.setup_test_colo( fileutils, @@ -84,7 +86,8 @@ def test_unsupported_custom_pinning(fileutils, coloutils, custom_pinning): db_type = "uds" # Test is insensitive to choice of db db_args = {"custom_pinning": custom_pinning} - exp = Experiment("colocated_model_defaults", launcher="local") + test_dir = fileutils.make_test_dir() + exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.raises(TypeError): coloutils.setup_test_colo( fileutils, @@ -120,7 +123,8 @@ def test_launch_colocated_model_defaults( db_args = {} - exp = Experiment("colocated_model_defaults", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment("colocated_model_defaults", launcher=launcher, exp_path=test_dir) colo_model = coloutils.setup_test_colo( fileutils, db_type, @@ -150,12 +154,12 @@ def test_launch_colocated_model_defaults( def test_launch_multiple_colocated_models( fileutils, coloutils, wlmutils, db_type, launcher="local" ): - """Test the concurrent launch of two models with a colocated database and local launcher - """ + """Test the concurrent launch of two models with a colocated database and local launcher""" db_args = {} - exp = Experiment("multi_colo_models", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment("multi_colo_models", launcher=launcher, exp_path=test_dir) colo_models = [ coloutils.setup_test_colo( fileutils, @@ -191,7 +195,10 @@ def test_launch_multiple_colocated_models( def 
test_colocated_model_disable_pinning( fileutils, coloutils, db_type, launcher="local" ): - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment( + "colocated_model_pinning_auto_1cpu", launcher=launcher, exp_path=test_dir + ) db_args = { "db_cpus": 1, "custom_pinning": [], @@ -214,7 +221,10 @@ def test_colocated_model_disable_pinning( def test_colocated_model_pinning_auto_2cpu( fileutils, coloutils, db_type, launcher="local" ): - exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment( + "colocated_model_pinning_auto_2cpu", launcher=launcher, exp_path=test_dir + ) db_args = { "db_cpus": 2, @@ -245,7 +255,10 @@ def test_colocated_model_pinning_auto_2cpu( def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment( + "colocated_model_pinning_manual", launcher=launcher, exp_path=test_dir + ) db_args = {"db_cpus": 2, "custom_pinning": range(2)} @@ -267,7 +280,10 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher=" def test_colocated_model_pinning_list(fileutils, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment( + "colocated_model_pinning_manual", launcher=launcher, exp_path=test_dir + ) db_args = {"db_cpus": 1, "custom_pinning": [1]} diff --git a/tests/test_config.py b/tests/test_config.py index e33ea7dfd..2321e008f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -190,3 +190,57 @@ def test_redis_cli(): with pytest.raises(SSConfigError): config.database_cli os.environ.pop("REDIS_CLI_PATH") + + +@pytest.mark.parametrize( + "value, exp_result", [ + pytest.param("0", False, id="string zero"), + pytest.param("1", True, id="string one"), + pytest.param("-1", False, id="string negative one"), + pytest.param(None, False, id="not in env"), + ] +) +def test_telemetry_flag(monkeypatch: pytest.MonkeyPatch, + value: t.Optional[str], + exp_result: bool): + if value is not None: + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", value) + else: + monkeypatch.delenv("SMARTSIM_FLAG_TELEMETRY", raising=False) + config = Config() + assert config.telemetry_enabled == exp_result +@pytest.mark.parametrize( + "value, exp_result", [ + pytest.param("1", 1, id="1"), + pytest.param("123", 123, id="123"), + pytest.param(None, 5, id="not in env"), + ] +) +def test_telemetry_frequency( + monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: int +): + if value is not None: + monkeypatch.setenv("SMARTSIM_TELEMETRY_FREQUENCY", value) + else: + monkeypatch.delenv("SMARTSIM_TELEMETRY_FREQUENCY", raising=False) + config = Config() + assert config.telemetry_frequency == exp_result + + +@pytest.mark.parametrize( + "value, exp_result", [ + pytest.param("30", 30, id="30"), + pytest.param("123", 123, id="123"), + pytest.param(None, 90, id="not in env"), + ] +) +def test_telemetry_cooldown( + monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: int +): + if value is not None: + monkeypatch.setenv("SMARTSIM_TELEMETRY_COOLDOWN", value) + else: + monkeypatch.delenv("SMARTSIM_TELEMETRY_COOLDOWN", raising=False) + 
config = Config() + assert config.telemetry_cooldown == exp_result diff --git a/tests/test_configs/echo.py b/tests/test_configs/echo.py new file mode 100644 index 000000000..8a34a0b6f --- /dev/null +++ b/tests/test_configs/echo.py @@ -0,0 +1,42 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import time + + +def echo(message: str, sleep_time: int): + if sleep_time > 0: + time.sleep(sleep_time) + print(f"Echoing: {message}") + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("--message", type=str, default="Lorem ipsum") + parser.add_argument("--sleep_time", type=int, default=0) + args = parser.parse_args() + echo(args.message, args.sleep_time) diff --git a/tests/test_configs/printing_model.py b/tests/test_configs/printing_model.py new file mode 100644 index 000000000..044b2a03b --- /dev/null +++ b/tests/test_configs/printing_model.py @@ -0,0 +1,18 @@ +import time +import sys + + +def main() -> int: + print(";START;") + time.sleep(20) + print(";MID;") + print("This is an error msg", file=sys.stderr) + time.sleep(20) + print(";END;") + + print("yay!!") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_configs/telemetry/colocatedmodel.json b/tests/test_configs/telemetry/colocatedmodel.json new file mode 100644 index 000000000..f3e93ac76 --- /dev/null +++ b/tests/test_configs/telemetry/colocatedmodel.json @@ -0,0 +1,69 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "002816b", + "timestamp": 1699037041106269774, + "model": [ + { + "name": "colocated_model", + "path": "/tmp/my-exp/colocated_model", + "exe_args": [ + "/path/to/my/script.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": {} + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "unix_socket": "/tmp/redis.socket", + "socket_permissions": 755, + "port": 0, + "cpus": 1, + 
"custom_pinning": "0", + "debug": false, + "db_identifier": "", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} + }, + "scripts": [], + "models": [] + }, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_ensemble/002816b/model/colocated_model", + "step_id": "4139111.21", + "task_id": "21529", + "managed": true + }, + "out_file": "/tmp/my-exp/colocated_model/colocated_model.out", + "err_file": "/tmp/my-exp/colocated_model/colocated_model.err" + } + ], + "orchestrator": [], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/db_and_model.json b/tests/test_configs/telemetry/db_and_model.json new file mode 100644 index 000000000..58c1c841a --- /dev/null +++ b/tests/test_configs/telemetry/db_and_model.json @@ -0,0 +1,86 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "2ca19ad", + "timestamp": 1699038647234488933, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.4", + "port": 6780, + "cluster": false, + "conf_file": null, + "out_file": "/path/to/some/file.out", + "err_file": "/path/to/some/file.err", + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "step_id": "4139111.27", + "task_id": "1452", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "4b5507a", + "timestamp": 1699038661491043211, + "model": [ + { + "name": "perroquet", + "path": "/tmp/my-exp/perroquet", + "exe_args": [ + "/path/to/my/script.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", + "step_id": "4139111.28", + "task_id": "2929", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet/perroquet.out", + "err_file": "/tmp/my-exp/perroquet/perroquet.err" + } + ], + "orchestrator": [], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/db_and_model_1run.json b/tests/test_configs/telemetry/db_and_model_1run.json new file mode 100644 index 000000000..44e32bfe4 --- /dev/null +++ b/tests/test_configs/telemetry/db_and_model_1run.json @@ -0,0 +1,79 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "4b5507a", + "timestamp": 1699038661491043211, + "model": [ + { + "name": "perroquet", + "path": "/tmp/my-exp/perroquet", + "exe_args": [ + "/path/to/my/script.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": 
"/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", + "step_id": "4139111.28", + "task_id": "2929", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet/perroquet.out", + "err_file": "/tmp/my-exp/perroquet/perroquet.err" + } + ], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.4", + "port": 6780, + "cluster": false, + "conf_file": null, + "out_file": "/path/to/some/file.out", + "err_file": "/path/to/some/file.err", + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "step_id": "4139111.27", + "task_id": "1452", + "managed": true + } + } + ] + } + ], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/ensembles.json b/tests/test_configs/telemetry/ensembles.json new file mode 100644 index 000000000..841324ec6 --- /dev/null +++ b/tests/test_configs/telemetry/ensembles.json @@ -0,0 +1,329 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/home/someuser/code/ss/my-exp", + "launcher": "Local" + }, + "runs": [ + { + "run_id": "d041b90", + "timestamp": 1698679830384608928, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_0", + "step_id": null, + "task_id": "88118", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_0.out", + "err_file": "/home/someuser/code/ss/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_1", + "step_id": null, + "task_id": "88131", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_1.out", + "err_file": "/home/someuser/code/ss/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" 
+ ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_2", + "step_id": null, + "task_id": "88146", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_2.out", + "err_file": "/home/someuser/code/ss/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_3", + "step_id": null, + "task_id": "88170", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_3.out", + "err_file": "/home/someuser/code/ss/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_4", + "step_id": null, + "task_id": "88178", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_4.out", + "err_file": "/home/someuser/code/ss/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_5", + "step_id": null, + "task_id": "88193", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_5.out", + "err_file": "/home/someuser/code/ss/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_6", + "step_id": null, + "task_id": "88221", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_6.out", + "err_file": "/home/someuser/code/ss/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], 
+ "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_7", + "step_id": null, + "task_id": "88241", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_7.out", + "err_file": "/home/someuser/code/ss/my-ens_7.err" + } + ] + } + ] + } + ] + } diff --git a/tests/test_configs/telemetry/serialmodels.json b/tests/test_configs/telemetry/serialmodels.json new file mode 100644 index 000000000..40337eceb --- /dev/null +++ b/tests/test_configs/telemetry/serialmodels.json @@ -0,0 +1,186 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "8c0fbb1", + "timestamp": 1699037881502730708, + "model": [ + { + "name": "perroquet_0", + "path": "/tmp/my-exp/perroquet_0", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_0", + "step_id": "4139111.22", + "task_id": "17966", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_0/perroquet_0.out", + "err_file": "/tmp/my-exp/perroquet_0/perroquet_0.err" + }, + { + "name": "perroquet_1", + "path": "/tmp/my-exp/perroquet_1", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_1", + "step_id": "4139111.23", + "task_id": "18100", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_1/perroquet_1.out", + "err_file": "/tmp/my-exp/perroquet_1/perroquet_1.err" + }, + { + "name": "perroquet_2", + "path": "/tmp/my-exp/perroquet_2", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_2", + "step_id": "4139111.24", + "task_id": "18159", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_2/perroquet_2.out", + "err_file": "/tmp/my-exp/perroquet_2/perroquet_2.err" + }, + { + "name": "perroquet_3", + "path": "/tmp/my-exp/perroquet_3", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": 
"/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_3", + "step_id": "4139111.25", + "task_id": "18499", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_3/perroquet_3.out", + "err_file": "/tmp/my-exp/perroquet_3/perroquet_3.err" + }, + { + "name": "perroquet_4", + "path": "/tmp/my-exp/perroquet_4", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_4", + "step_id": "4139111.26", + "task_id": "18832", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_4/perroquet_4.out", + "err_file": "/tmp/my-exp/perroquet_4/perroquet_4.err" + } + ], + "orchestrator": [], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/telemetry.json b/tests/test_configs/telemetry/telemetry.json new file mode 100644 index 000000000..a380bc5fb --- /dev/null +++ b/tests/test_configs/telemetry/telemetry.json @@ -0,0 +1,946 @@ +{ + "experiment": { + "name": "my-exp", + "path": "/path/to/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "d999ad89-020f-4e6a-b834-dbd88658ce84", + "timestamp": 1697824072792854287, + "model": [ + { + "name": "my-model", + "path": "/path/to/my-exp/my-model", + "exe_args": [ + "hello", + "world" + ], + "run_settings": { + "exe": [ + "/usr/bin/echo" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "port": 5757, + "ifname": "lo", + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "COLO", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} + }, + "scripts": [], + "models": [ + { + "cnn": { + "backend": "TORCH", + "device": "CPU" + } + } + ] + }, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-dbd88658ce84/model/my-model", + "step_id": "4121050.30", + "task_id": "25230", + "managed": true + }, + "out_file": "/path/to/my-exp/my-model/my-model.out", + "err_file": "/path/to/my-exp/my-model/my-model.err" + } + ], + "orchestrator": [], + "ensemble": [] + }, + { + "run_id": "fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa", + "timestamp": 1697824102122439975, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_1", + "hostname": "10.128.0.70", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_1-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", 
+ "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } + }, + { + "name": "orchestrator_2", + "hostname": "10.128.0.71", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_2-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } + }, + { + "name": "orchestrator_0", + "hostname": "10.128.0.69", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_0-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "d65ae1df-cb5e-45d9-ab09-6fa641755997", + "timestamp": 1697824127962219505, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/path/to/my-exp/my-ens/my-ens_0", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_0", + "step_id": "4121050.32", + "task_id": "25639", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/path/to/my-exp/my-ens/my-ens_1", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_1", + "step_id": "4121050.33", + "task_id": "25768", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/path/to/my-exp/my-ens/my-ens_2", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + 
"colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_2", + "step_id": "4121050.34", + "task_id": "25817", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/path/to/my-exp/my-ens/my-ens_3", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_3", + "step_id": "4121050.35", + "task_id": "25837", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/path/to/my-exp/my-ens/my-ens_4", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_4", + "step_id": "4121050.36", + "task_id": "25872", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": "/path/to/my-exp/my-ens/my-ens_5", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_5", + "step_id": "4121050.37", + "task_id": "25930", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/path/to/my-exp/my-ens/my-ens_6", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_6", + "step_id": "4121050.38", + "task_id": "25945", + "managed": true + }, + "out_file": 
"/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/path/to/my-exp/my-ens/my-ens_7", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_7", + "step_id": "4121050.39", + "task_id": "25967", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" + } + ] + } + ] + }, + { + "run_id": "e41f8e17-c4b2-441d-adf9-707443ee2c72", + "timestamp": 1697835227560376025, + "model": [ + { + "name": "my-model", + "path": "/path/to/my-exp/my-model", + "exe_args": [ + "hello", + "world" + ], + "run_settings": { + "exe": [ + "/usr/bin/echo" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "port": 5757, + "ifname": "lo", + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "COLO", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} + }, + "scripts": [], + "models": [ + { + "cnn": { + "backend": "TORCH", + "device": "CPU" + } + } + ] + }, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/e41f8e17-c4b2-441d-adf9-707443ee2c72/model/my-model", + "step_id": "4121904.0", + "task_id": "28277", + "managed": true + }, + "out_file": "/path/to/my-exp/my-model/my-model.out", + "err_file": "/path/to/my-exp/my-model/my-model.err" + } + ], + "orchestrator": [], + "ensemble": [] + }, + { + "run_id": "b33a5d27-6822-4795-8e0e-cfea18551fa4", + "timestamp": 1697835261956135240, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.2", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_0-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + }, + { + "name": "orchestrator_2", + "hostname": "10.128.0.4", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_2-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + }, + { + "name": "orchestrator_1", + "hostname": "10.128.0.3", + "port": 2424, + "cluster": true, + "conf_file": 
"nodes-orchestrator_1-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "45772df2-fd80-43fd-adf0-d5e319870182", + "timestamp": 1697835287798613875, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/path/to/my-exp/my-ens/my-ens_0", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_0", + "step_id": "4121904.2", + "task_id": "28333", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/path/to/my-exp/my-ens/my-ens_1", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_1", + "step_id": "4121904.3", + "task_id": "28342", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/path/to/my-exp/my-ens/my-ens_2", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_2", + "step_id": "4121904.4", + "task_id": "28353", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/path/to/my-exp/my-ens/my-ens_3", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": 
"baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_3", + "step_id": "4121904.5", + "task_id": "28362", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/path/to/my-exp/my-ens/my-ens_4", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_4", + "step_id": "4121904.6", + "task_id": "28371", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": "/path/to/my-exp/my-ens/my-ens_5", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_5", + "step_id": "4121904.7", + "task_id": "28380", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/path/to/my-exp/my-ens/my-ens_6", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_6", + "step_id": "4121904.8", + "task_id": "28389", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/path/to/my-exp/my-ens/my-ens_7", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_7", + "step_id": 
"4121904.9", + "task_id": "28398", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" + } + ] + } + ] + } + ] + } + diff --git a/tests/test_controller.py b/tests/test_controller.py new file mode 100644 index 000000000..c00adce91 --- /dev/null +++ b/tests/test_controller.py @@ -0,0 +1,68 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import pathlib + +from smartsim._core.control.controller import Controller +from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings +from smartsim._core.launcher.step import Step +from smartsim.entity.ensemble import Ensemble +from smartsim.database.orchestrator import Orchestrator + +controller = Controller() + +rs = SrunSettings('echo', ['spam', 'eggs']) +bs = SbatchSettings() + +ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) +orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") + +class MockStep(Step): + @staticmethod + def _create_unique_name(name): + return name + + def add_to_batch(self, step): + ... 
+ + def get_launch_cmd(self): + return [] + +@pytest.mark.parametrize("collection", [ + pytest.param(ens, id="Ensemble"), + pytest.param(orc, id="Database"), +]) +def test_controller_batch_step_creation_preserves_entity_order(collection, monkeypatch): + monkeypatch.setattr(controller._launcher, "create_step", + lambda name, path, settings: MockStep(name, path, settings)) + entity_names = [x.name for x in collection.entities] + assert len(entity_names) == len(set(entity_names)) + _, steps = controller._create_batch_job_step(collection, pathlib.Path("mock/exp/path")) + assert entity_names == [step.name for step in steps] + + diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index 30d9870cf..a40ccdf66 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -100,7 +100,7 @@ def test_wrong_orchestrator(wlmutils): cont = Controller(launcher="local") manifest = Manifest(orc) with pytest.raises(SmartSimError): - cont._launch(manifest) + cont._launch("exp_name", "exp_path", manifest) def test_bad_orc_checkpoint(): diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index bf5604c41..273c6de20 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -51,8 +51,8 @@ def test_parse_db_host_error(): def test_hosts(fileutils, wlmutils): exp_name = "test_hosts" - exp = Experiment(exp_name) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir) orc = Orchestrator(port=wlmutils.get_test_port(), interface="lo", launcher="local") orc.set_path(test_dir) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index edc69527c..de6ab37e3 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -26,10 +26,13 @@ import pytest +import os + from smartsim import Experiment from smartsim.entity import Model from smartsim.error import SmartSimError from smartsim.settings import RunSettings +from smartsim._core.config import CONFIG # The tests in this file belong to the slow_tests group pytestmark = pytest.mark.slow_tests @@ -111,8 +114,8 @@ def test_bad_ensemble_init_no_rs_bs(): def test_stop_entity(fileutils): exp_name = "test_stop_entity" - exp = Experiment(exp_name) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir) m = exp.create_model("model", path=test_dir, run_settings=RunSettings("sleep", "5")) exp.start(m, block=False) assert exp.finished(m) == False @@ -123,8 +126,8 @@ def test_stop_entity(fileutils): def test_poll(fileutils): # Ensure that a SmartSimError is not raised exp_name = "test_exp_poll" - exp = Experiment(exp_name) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir) model = exp.create_model( "model", path=test_dir, run_settings=RunSettings("sleep", "5") ) @@ -135,8 +138,8 @@ def test_poll(fileutils): def test_summary(fileutils): exp_name = "test_exp_summary" - exp = Experiment(exp_name) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir) m = exp.create_model( "model", path=test_dir, run_settings=RunSettings("echo", "Hello") ) @@ -156,6 +159,7 @@ def test_summary(fileutils): assert 0 == int(row["RunID"]) assert 0 == int(row["Returncode"]) + def test_launcher_detection(wlmutils, monkeypatch): if wlmutils.get_test_launcher() == "pals": pytest.skip(reason="Launcher detection cannot currently detect pbs vs pals") @@ -165,3 +169,16 @@ def test_launcher_detection(wlmutils, monkeypatch): exp = Experiment("test-launcher-detection", launcher="auto") assert exp._launcher == 
wlmutils.get_test_launcher()
+
+
+def test_enable_disable_telemetry(monkeypatch):
+    # TODO: Currently these are implemented by setting an environment variable
+    # so that ALL experiments instantiated in a driver script will begin
+    # producing telemetry data. In the future it is planned to have this
+    # work on a "per-instance" basis
+    monkeypatch.setattr(os, "environ", {})
+    exp = Experiment("my-exp")
+    exp.enable_telemetry()
+    assert CONFIG.telemetry_enabled
+    exp.disable_telemetry()
+    assert not CONFIG.telemetry_enabled
diff --git a/tests/test_generator.py b/tests/test_generator.py
index 91f242ab1..a99d5795d 100644
--- a/tests/test_generator.py
+++ b/tests/test_generator.py
@@ -264,10 +264,8 @@ def test_multiple_tags(fileutils):
     exp.start(parameterized_model, block=True)
 
     with open(osp.join(parameterized_model.path, "multi-tags.out")) as f:
-        line = f.readline()
-        assert (
-            line.strip() == "My two parameters are 6379 and unbreakable_password, OK?"
-        )
+        log_content = f.read()
+        assert "My two parameters are 6379 and unbreakable_password, OK?" in log_content
 
 
 def test_generation_log(fileutils):
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 55dd7cbe3..ca145042e 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -27,6 +27,7 @@
 import pytest
 
 from smartsim._core.utils.helpers import cat_arg_and_value
+from smartsim._core.utils import helpers
 
 # The tests in this file belong to the group_a group
 pytestmark = pytest.mark.group_a
@@ -47,3 +48,17 @@ def test_single_char_concat():
 def test_fallthrough_concat():
     result = cat_arg_and_value("xx", "FOO")  # <-- no dashes, > 1 char
     assert result == "--xx=FOO"
+
+def test_encode_decode_cmd_round_trip():
+    orig_cmd = ["this", "is", "a", "cmd"]
+    decoded_cmd = helpers.decode_cmd(helpers.encode_cmd(orig_cmd))
+    assert orig_cmd == decoded_cmd
+    assert orig_cmd is not decoded_cmd
+
+def test_encode_raises_on_empty():
+    with pytest.raises(ValueError):
+        helpers.encode_cmd([])
+
+def test_decode_raises_on_empty():
+    with pytest.raises(ValueError):
+        helpers.decode_cmd("")
diff --git a/tests/test_indirect.py b/tests/test_indirect.py
new file mode 100644
index 000000000..09f630304
--- /dev/null
+++ b/tests/test_indirect.py
@@ -0,0 +1,195 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2023, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import pathlib +import psutil +import pytest +import sys +import uuid + +from smartsim._core.entrypoints.indirect import get_parser, cleanup, get_ts, main +from smartsim._core.utils.serialize import TELMON_SUBDIR, MANIFEST_FILENAME +from smartsim._core.utils.helpers import encode_cmd + +ALL_ARGS = {"+command", "+entity_type", "+telemetry_dir", "+output_file", "+error_file", "+working_dir"} + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +@pytest.mark.parametrize( + ["cmd", "missing"], + [ + pytest.param("indirect.py", {"+name", "+command", "+entity_type", "+telemetry_dir", "+working_dir"}, id="no args"), + pytest.param("indirect.py -c echo +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="cmd typo"), + pytest.param("indirect.py -t orchestrator +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="etype typo"), + pytest.param("indirect.py -d /foo/bar +entity_type ttt +command ccc +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="dir typo"), + pytest.param("indirect.py +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="no cmd"), + pytest.param("indirect.py +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="no etype"), + pytest.param("indirect.py +command ccc +entity_type ttt +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="no dir"), + ] +) +def test_parser(capsys, cmd, missing): + """Test that the parser reports any missing required arguments""" + parser = get_parser() + + args = cmd.split() + + captured = capsys.readouterr() # throw away existing output + with pytest.raises(SystemExit) as ex: + ns = parser.parse_args(args) + + captured = capsys.readouterr() + assert "the following arguments are required" in captured.err + for arg in missing: + assert arg in captured.err + + expected = ALL_ARGS - missing + msg_tuple = captured.err.split("the following arguments are required: ") + if len(msg_tuple) < 2: + assert False, "error message indicates no missing arguments" + + actual_missing = msg_tuple[1].strip() + for exp in expected: + assert f"{exp}/" not in actual_missing + + +def test_cleanup(capsys, monkeypatch): + """Ensure cleanup attempts termination of correct process""" + mock_pid = 123 + create_msg = "creating: {0}" + term_msg = "terminating: {0}" + + class MockProc: + def __init__(self, pid: int): + print(create_msg.format(pid)) + def terminate(self): + print(term_msg.format(mock_pid)) + + captured = capsys.readouterr() # throw away existing output + + with monkeypatch.context() as ctx: + ctx.setattr('psutil.pid_exists', lambda pid: True) + ctx.setattr('psutil.Process', MockProc) + ctx.setattr('smartsim._core.entrypoints.indirect.STEP_PID', mock_pid) + cleanup() + + captured = capsys.readouterr() + assert create_msg.format(mock_pid) in captured.out + 
assert term_msg.format(mock_pid) in captured.out
+
+
+def test_cleanup_late(capsys, monkeypatch):
+    """Ensure cleanup exceptions are swallowed if a process is already terminated"""
+    mock_pid = 123
+    create_msg = "creating: {0}"
+    term_msg = "terminating: {0}"
+
+    class MockMissingProc:
+        def __init__(self, pid: int) -> None:
+            print(create_msg.format(mock_pid))
+            raise psutil.NoSuchProcess(pid)
+
+        def terminate(self) -> None:
+            print(term_msg.format(mock_pid))
+
+    captured = capsys.readouterr()  # throw away existing output
+
+    with monkeypatch.context() as ctx:
+        ctx.setattr('psutil.pid_exists', lambda pid: True)
+        ctx.setattr('psutil.Process', MockMissingProc)
+        ctx.setattr('smartsim._core.entrypoints.indirect.STEP_PID', mock_pid)
+        cleanup()
+
+    captured = capsys.readouterr()
+    assert create_msg.format(mock_pid) in captured.out
+
+
+def test_ts():
+    """Ensure expected output type"""
+    ts = get_ts()
+    assert isinstance(ts, int)
+
+
+def test_indirect_main_dir_check(fileutils):
+    """Ensure that the proxy creates the telemetry status directory when missing"""
+    test_dir = fileutils.make_test_dir()
+    exp_dir = pathlib.Path(test_dir)
+
+    cmd = ["echo", "unit-test"]
+    encoded_cmd = encode_cmd(cmd)
+
+    status_path = exp_dir / TELMON_SUBDIR
+
+    # show that a missing status_path is created
+    main(encoded_cmd, "application", exp_dir, status_path)
+
+    assert status_path.exists()
+
+
+def test_indirect_main_cmd_check(capsys, fileutils, monkeypatch):
+    """Ensure that the proxy validates the cmd is not empty or whitespace-only"""
+    test_dir = fileutils.make_test_dir()
+    exp_dir = pathlib.Path(test_dir)
+
+    captured = capsys.readouterr()  # throw away existing output
+    with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex:
+        ctx.setattr('smartsim._core.entrypoints.indirect.logger.error', print)
+        _ = main("", "application", exp_dir, exp_dir / TELMON_SUBDIR)
+
+    captured = capsys.readouterr()
+    assert "Invalid cmd supplied" in ex.value.args[0]
+
+    # test with a whitespace-only cmd
+    with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex:
+        ctx.setattr('smartsim._core.entrypoints.indirect.logger.error', print)
+        _ = main("  \n  \t   ", "application", exp_dir, exp_dir / TELMON_SUBDIR)
+
+    captured = capsys.readouterr()
+    assert "Invalid cmd supplied" in ex.value.args[0]
+
+
+def test_complete_process(fileutils):
+    """Ensure the happy-path completes and returns a success return code"""
+    script = fileutils.get_test_conf_path("sleep.py")
+
+    test_dir = fileutils.make_test_dir()
+    exp_dir = pathlib.Path(test_dir)
+
+    raw_cmd = f"{sys.executable} {script} --time=1"
+    cmd = encode_cmd(raw_cmd.split())
+
+    rc = main(cmd, "application", exp_dir, exp_dir / TELMON_SUBDIR)
+    assert rc == 0
+
+    assert exp_dir.exists()
+
+    # NOTE: don't have a manifest so we're falling back to default event path
+    data_dir = exp_dir / TELMON_SUBDIR
+    start_events = list(data_dir.rglob("start.json"))
+    stop_events = list(data_dir.rglob("stop.json"))
+
+    assert start_events
+    assert stop_events
diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py
index 89659db42..c40db4d18 100644
--- a/tests/test_launch_errors.py
+++ b/tests/test_launch_errors.py
@@ -48,8 +48,8 @@ def test_unsupported_run_settings():
 def test_model_failure(fileutils):
     exp_name = "test-model-failure"
-    exp = Experiment(exp_name, launcher="local")
     test_dir = fileutils.make_test_dir()
+    exp = Experiment(exp_name, launcher="local", exp_path=test_dir)
 
     script = fileutils.get_test_conf_path("bad.py")
     settings = RunSettings("python", f"{script}
--time=3") @@ -64,8 +64,8 @@ def test_model_failure(fileutils): def test_orchestrator_relaunch(fileutils, wlmutils): """Test when users try to launch second orchestrator""" exp_name = "test-orc-on-relaunch" - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) orc = Orchestrator(port=wlmutils.get_test_port()) orc.set_path(test_dir) diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index 30389a0b0..9cacd810e 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -38,8 +38,8 @@ def test_models(fileutils): exp_name = "test-models-local-launch" - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -54,8 +54,8 @@ def test_models(fileutils): def test_ensemble(fileutils): exp_name = "test-ensemble-launch" - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index 7f1f5c624..5edaff1c9 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -38,8 +38,8 @@ def test_models(fileutils): exp_name = "test-models-local-launch" - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index c9078f125..54884a3f4 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -39,8 +39,8 @@ def test_restart(fileutils): exp_name = "test-models-local-restart" - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -59,8 +59,8 @@ def test_restart(fileutils): def test_ensemble(fileutils): exp_name = "test-ensemble-restart" - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_manifest.py b/tests/test_manifest.py index f68219c73..5bb373fc1 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -26,11 +26,17 @@ from copy import deepcopy +import os.path import pytest from smartsim import Experiment -from smartsim._core.control import Manifest +from smartsim._core.control.manifest import ( + Manifest, + LaunchedManifest, + LaunchedManifestBuilder, + _LaunchedManifestMetadata as LaunchedManifestMetadata, +) from smartsim.database import Orchestrator from smartsim.error import SmartSimError from smartsim.settings import RunSettings @@ -48,7 +54,6 @@ model_2 = exp.create_model("model_1", run_settings=rs) ensemble = exp.create_ensemble("ensemble", run_settings=rs, replicas=1) - orc = Orchestrator() orc_1 = deepcopy(orc) 
orc_1.name = "orc2"
@@ -97,3 +102,67 @@ class Person:
     p = Person()
     with pytest.raises(TypeError):
         _ = Manifest(p)
+
+def test_launched_manifest_transform_data():
+    models = [(model, 1), (model_2, 2)]
+    ensembles = [(ensemble, [(m, i) for i, m in enumerate(ensemble.entities)])]
+    dbs = [(orc, [(n, i) for i, n in enumerate(orc.entities)])]
+    launched = LaunchedManifest(
+        metadata=LaunchedManifestMetadata("name", "path", "launcher", "run_id"),
+        models=models,
+        ensembles=ensembles,
+        databases=dbs,
+    )
+    transformed = launched.map(lambda x: str(x))
+    assert transformed.models == tuple((m, str(i)) for m, i in models)
+    assert transformed.ensembles[0][1] == tuple((m, str(i)) for m, i in ensembles[0][1])
+    assert transformed.databases[0][1] == tuple((n, str(i)) for n, i in dbs[0][1])
+
+
+def test_launched_manifest_builder_correctly_maps_data():
+    lmb = LaunchedManifestBuilder("name", "path", "launcher name")
+    lmb.add_model(model, 1)
+    lmb.add_model(model_2, 1)
+    lmb.add_ensemble(ensemble, [i for i in range(len(ensemble.entities))])
+    lmb.add_database(orc, [i for i in range(len(orc.entities))])
+
+    manifest = lmb.finalize()
+    assert len(manifest.models) == 2
+    assert len(manifest.ensembles) == 1
+    assert len(manifest.databases) == 1
+
+
+def test_launched_manifest_builder_raises_if_lens_do_not_match():
+    lmb = LaunchedManifestBuilder("name", "path", "launcher name")
+    with pytest.raises(ValueError):
+        lmb.add_ensemble(ensemble, list(range(123)))
+    with pytest.raises(ValueError):
+        lmb.add_database(orc, list(range(123)))
+
+
+def test_launched_manifest_builder_raises_if_attaching_data_to_empty_collection(
+    monkeypatch
+):
+    lmb = LaunchedManifestBuilder("name", "path", "launcher")
+    monkeypatch.setattr(ensemble, "entities", [])
+    with pytest.raises(ValueError):
+        lmb.add_ensemble(ensemble, [])
+
+
+def test_lmb_and_launched_manifest_have_same_paths_for_launched_metadata():
+    exp_path = "/path/to/some/exp"
+    lmb = LaunchedManifestBuilder("exp_name", exp_path, "launcher")
+    manifest = lmb.finalize()
+    assert lmb.exp_telemetry_subdirectory == manifest.metadata.exp_telemetry_subdirectory
+    assert lmb.run_telemetry_subdirectory == manifest.metadata.run_telemetry_subdirectory
+    assert os.path.commonprefix([
+        manifest.metadata.run_telemetry_subdirectory,
+        manifest.metadata.exp_telemetry_subdirectory,
+        manifest.metadata.manifest_file_path,
+        exp_path,
+    ]) == exp_path
+    assert os.path.commonprefix([
+        manifest.metadata.run_telemetry_subdirectory,
+        manifest.metadata.exp_telemetry_subdirectory,
+        manifest.metadata.manifest_file_path,
+    ]) == str(manifest.metadata.exp_telemetry_subdirectory)
diff --git a/tests/test_model.py b/tests/test_model.py
index 103e8a09c..76af50b54 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -28,6 +28,7 @@
 from smartsim import Experiment
 from smartsim._core.launcher.step import SbatchStep, SrunStep
+from smartsim._core.control.manifest import LaunchedManifestBuilder
 from smartsim.entity import Ensemble, Model
 from smartsim.error import EntityExistsError, SSUnsupportedError
 from smartsim.settings import RunSettings, SbatchSettings, SrunSettings
@@ -88,8 +89,10 @@ def monkeypatch_exp_controller(monkeypatch):
     def _monkeypatch_exp_controller(exp):
         entity_steps = []
 
-        def start_wo_job_manager(self, manifest, block=True, kill_on_interrupt=True):
-            self._launch(manifest)
+        def start_wo_job_manager(self, exp_name, exp_path, manifest,
+                                 block=True, kill_on_interrupt=True):
+            self._launch(exp_name, exp_path, manifest)
+            return LaunchedManifestBuilder("name", "path",
"launcher").finalize() def launch_step_nop(self, step, entity): entity_steps.append((step, entity)) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 9b50f1e80..7bea4e0c8 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -59,7 +59,7 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create regular database orc = exp.create_database( @@ -129,7 +129,7 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create run settings colo_settings = exp.create_run_settings("python", test_script) @@ -172,7 +172,7 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ exp.stop(orc) -def test_db_identifier_standard_twice_not_unique(wlmutils): +def test_db_identifier_standard_twice_not_unique(wlmutils, fileutils): """Test uniqueness of db_identifier several calls to create_database, with non unique names, checking error is raised before exp start is called""" @@ -183,9 +183,10 @@ def test_db_identifier_standard_twice_not_unique(wlmutils): test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() + test_dir = fileutils.make_test_dir() # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # CREATE DATABASE with db_identifier orc = exp.create_database( @@ -297,7 +298,9 @@ def test_multidb_colo_once(fileutils, wlmutils, coloutils, db_type): test_script = fileutils.get_test_conf_path("smartredis/dbid.py") # start a new Experiment for this section - exp = Experiment("test_multidb_colo_once", launcher=test_launcher) + exp = Experiment("test_multidb_colo_once", + launcher=test_launcher, + exp_path=test_dir) # create run settings run_settings = exp.create_run_settings("python", test_script) @@ -463,8 +466,8 @@ def test_launch_cluster_orc_single_dbid(fileutils, wlmutils): exp_name = "test_launch_cluster_orc_single_dbid" launcher = wlmutils.get_test_launcher() - exp = Experiment(exp_name, launcher=launcher) test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index e61139931..586dbcefa 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -70,8 +70,8 @@ def test_inactive_orc_get_address(): def test_orc_active_functions(fileutils, wlmutils): exp_name = "test_orc_active_functions" - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) db = Orchestrator(port=wlmutils.get_test_port()) db.set_path(test_dir) @@ -98,8 +98,8 @@ def test_orc_active_functions(fileutils, wlmutils): def test_multiple_interfaces(fileutils, wlmutils): exp_name = "test_multiple_interfaces" - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir() + exp = 
Experiment(exp_name, launcher="local", exp_path=test_dir) net_if_addrs = psutil.net_if_addrs() net_if_addrs = [ diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index 1edb183fa..ce35e135f 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -30,6 +30,9 @@ import shutil import sys +import smartsim._core.config.config +from smartsim.error import SSUnsupportedError + from smartsim.settings import PalsMpiexecSettings from smartsim._core.launcher import PBSLauncher from smartsim._core.launcher.step.mpiStep import MpiexecStep @@ -41,6 +44,15 @@ default_exe = sys.executable default_kwargs = {"fail_if_missing_exec": False} + +@pytest.fixture(autouse=True) +def turn_off_telemetry_indirect(monkeypatch): + monkeypatch.setattr( + smartsim._core.config.config.Config, + "telemetry_enabled", False) + yield + + # Uncomment when # @pytest.mark.parametrize( # "function_name",[ @@ -56,6 +68,7 @@ # with pytest.raises(SSUnsupportedError): # func(None) + def test_affinity_script(): settings = PalsMpiexecSettings(default_exe, **default_kwargs) settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2) diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index ecda7f1e6..60105da41 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -45,8 +45,8 @@ def test_local_orchestrator(fileutils, wlmutils): """Test launching orchestrator locally""" global first_dir exp_name = "test-orc-launch-local" - exp = Experiment(exp_name, launcher="local") test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) first_dir = test_dir orc = Orchestrator(port=wlmutils.get_test_port()) @@ -61,12 +61,13 @@ def test_local_orchestrator(fileutils, wlmutils): exp._control._launcher.task_manager.actively_monitoring = False -def test_reconnect_local_orc(): +def test_reconnect_local_orc(fileutils): """Test reconnecting to orchestrator from first experiment""" global first_dir # start new experiment exp_name = "test-orc-local-reconnect-2nd" - exp_2 = Experiment(exp_name, launcher="local") + test_dir = fileutils.make_test_dir() + exp_2 = Experiment(exp_name, launcher="local", exp_path=test_dir) checkpoint = osp.join(first_dir, "smartsim_db.dat") reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) diff --git a/tests/test_serialize.py b/tests/test_serialize.py new file mode 100644 index 000000000..2010b77e2 --- /dev/null +++ b/tests/test_serialize.py @@ -0,0 +1,175 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import logging
+from pathlib import Path
+import json
+
+from smartsim import Experiment
+from smartsim.database.orchestrator import Orchestrator
+from smartsim._core.utils import serialize
+from smartsim._core._cli import utils
+from smartsim._core.control.manifest import LaunchedManifestBuilder
+import smartsim._core.config.config
+
+_REL_MANIFEST_PATH = f"{serialize.TELMON_SUBDIR}/{serialize.MANIFEST_FILENAME}"
+_CFG_TM_ENABLED_ATTR = "telemetry_enabled"
+
+# The tests in this file belong to the group_b group
+pytestmark = pytest.mark.group_b
+
+@pytest.fixture(autouse=True)
+def turn_on_tm(monkeypatch):
+    monkeypatch.setattr(
+        smartsim._core.config.config.Config,
+        _CFG_TM_ENABLED_ATTR,
+        property(lambda self: True))
+    yield
+
+
+def test_serialize_creates_a_manifest_json_file_if_dne(fileutils):
+    test_dir = fileutils.get_test_dir()
+    lmb = LaunchedManifestBuilder("exp", test_dir, "launcher")
+    serialize.save_launch_manifest(lmb.finalize())
+    manifest_json = Path(test_dir) / _REL_MANIFEST_PATH
+
+    assert manifest_json.is_file()
+    with open(manifest_json, 'r') as f:
+        manifest = json.load(f)
+        assert manifest["experiment"]["name"] == "exp"
+        assert manifest["experiment"]["launcher"] == "launcher"
+        assert isinstance(manifest["runs"], list)
+        assert len(manifest["runs"]) == 1
+
+
+def test_serialize_does_not_write_manifest_json_if_telemetry_monitor_is_off(
+    fileutils, monkeypatch
+):
+    monkeypatch.setattr(
+        smartsim._core.config.config.Config,
+        _CFG_TM_ENABLED_ATTR,
+        property(lambda self: False))
+    test_dir = fileutils.get_test_dir()
+    lmb = LaunchedManifestBuilder("exp", test_dir, "launcher")
+    serialize.save_launch_manifest(lmb.finalize())
+    manifest_json = Path(test_dir) / _REL_MANIFEST_PATH
+    assert not manifest_json.exists()
+
+
+def test_serialize_appends_a_manifest_json_exists(fileutils):
+    test_dir = fileutils.get_test_dir()
+    manifest_json = Path(test_dir) / _REL_MANIFEST_PATH
+    serialize.save_launch_manifest(
+        LaunchedManifestBuilder("exp", test_dir, "launcher").finalize())
+    serialize.save_launch_manifest(
+        LaunchedManifestBuilder("exp", test_dir, "launcher").finalize())
+    serialize.save_launch_manifest(
+        LaunchedManifestBuilder("exp", test_dir, "launcher").finalize())
+
+    assert manifest_json.is_file()
+    with open(manifest_json, 'r') as f:
+        manifest = json.load(f)
+        assert isinstance(manifest["runs"], list)
+        assert len(manifest["runs"]) == 3
+        assert len({run["run_id"] for run in manifest["runs"]}) == 3
+
+
+def test_serialize_overwrites_file_if_not_json(fileutils):
+    test_dir = fileutils.get_test_dir()
+    manifest_json = Path(test_dir) / _REL_MANIFEST_PATH
+    manifest_json.parent.mkdir(parents=True, exist_ok=True)
+    with open(manifest_json, 'w') as f:
+        f.write("This is not a json\n")
+
+    lmb = LaunchedManifestBuilder("exp", test_dir, "launcher")
+    serialize.save_launch_manifest(lmb.finalize())
+    with open(manifest_json, 'r') as f:
+        assert isinstance(json.load(f), dict)
+
+
+def test_started_entities_are_serialized(fileutils):
+    exp_name = "test-exp"
+    test_dir = Path(fileutils.make_test_dir()) / exp_name
+    test_dir.mkdir(parents=True)
+    exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local")
+
+    rs1 = exp.create_run_settings("echo", ["hello", "world"])
+    rs2 = exp.create_run_settings("echo", ["spam", "eggs"])
+
+    hello_world_model = exp.create_model("echo-hello", run_settings=rs1)
+    spam_eggs_model = exp.create_model("echo-spam", run_settings=rs2)
+    hello_ensemble = exp.create_ensemble('echo-ensemble', run_settings=rs1, replicas=3)
+
+    exp.generate(hello_world_model, spam_eggs_model, hello_ensemble)
+    exp.start(hello_world_model, spam_eggs_model, block=False)
+    exp.start(hello_ensemble, block=False)
+
+    manifest_json = Path(exp.exp_path) / _REL_MANIFEST_PATH
+    try:
+        with open(manifest_json, 'r') as f:
+            manifest = json.load(f)
+            assert len(manifest["runs"]) == 2
+            assert len(manifest["runs"][0]["model"]) == 2
+            assert len(manifest["runs"][0]["ensemble"]) == 0
+            assert len(manifest["runs"][1]["model"]) == 0
+            assert len(manifest["runs"][1]["ensemble"]) == 1
+            assert len(manifest["runs"][1]["ensemble"][0]["models"]) == 3
+    finally:
+        exp.stop(hello_world_model, spam_eggs_model, hello_ensemble)
+
+
+def test_serialized_database_does_not_break_if_using_a_non_standard_install(
+    monkeypatch
+):
+    monkeypatch.setattr(utils, "get_db_path", lambda: None)
+    db = Orchestrator()
+    dict_ = serialize._dictify_db(db, [])
+    assert dict_["type"] == "Unknown"
+
+
+def test_dictify_run_settings_warns_when_attempting_to_dictify_mpmd(
+    monkeypatch, caplog, fileutils
+):
+    # TODO: Eventually this test should be removed and we should be able to
+    # handle MPMD run settings as part of the output dict
+    exp_name = "test-exp"
+    test_dir = Path(fileutils.make_test_dir()) / exp_name
+    test_dir.mkdir(parents=True)
+    exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local")
+
+    rs1 = exp.create_run_settings("echo", ["hello", "world"])
+    rs2 = exp.create_run_settings("echo", ["spam", "eggs"])
+
+    # Make rs1 "MPMD"
+    monkeypatch.setattr(rs1, "mpmd", [rs2], raising=False)
+    # Swap in a plain logger so caplog can capture the (colored) log record
+    monkeypatch.setattr(serialize, "_LOGGER", logging.getLogger())
+    serialize._dictify_run_settings(rs1)
+    (rec,) = caplog.records
+    assert rec.levelno == logging.WARNING
+    assert "MPMD run settings" in rec.msg
diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py
new file mode 100644
index 000000000..cf85e26e2
--- /dev/null
+++ b/tests/test_telemetry_monitor.py
@@ -0,0 +1,1139 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2023, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import logging +import pathlib +from random import sample +import pytest +import shutil +import sys +import typing as t +import time +import uuid +from conftest import FileUtils, MLUtils, WLMUtils +import smartsim + +from smartsim._core.control.jobmanager import JobManager +from smartsim._core.control.job import Job, JobEntity, _JobKey +from smartsim._core.launcher.launcher import WLMLauncher +from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher +from smartsim._core.launcher.step.step import Step, proxyable_launch_cmd +from smartsim._core.launcher.stepInfo import StepInfo +from smartsim.error.errors import UnproxyableStepError +from smartsim.settings.base import RunSettings +from smartsim.status import ( + STATUS_COMPLETED, + STATUS_CANCELLED, + STATUS_FAILED, + STATUS_NEW, + STATUS_PAUSED, + STATUS_RUNNING, + TERMINAL_STATUSES, +) +import smartsim._core.config.config as cfg + +from smartsim._core.entrypoints.telemetrymonitor import ( + can_shutdown, + event_loop, + faux_return_code, + get_parser, + get_ts, + track_event, + load_manifest, + hydrate_persistable, + ManifestEventHandler, +) +from smartsim._core.utils import serialize +from smartsim import Experiment + + +ALL_ARGS = {"-exp_dir", "-frequency"} +PROXY_ENTRY_POINT = "smartsim._core.entrypoints.indirect" +CFG_TM_ENABLED_ATTR = "telemetry_enabled" + + +for_all_wlm_launchers = pytest.mark.parametrize( + "wlm_launcher", + [pytest.param(cls(), id=cls.__name__) for cls in WLMLauncher.__subclasses__()], +) + +requires_wlm = pytest.mark.skipif( + pytest.test_launcher == "local", + reason="Test requires WLM" +) + + +logger = logging.getLogger() + +# The tests in this file belong to the slow_tests group +pytestmark = pytest.mark.slow_tests + + +@pytest.fixture(autouse=True) +def turn_on_tm(monkeypatch): + monkeypatch.setattr( + cfg.Config, + CFG_TM_ENABLED_ATTR, + property(lambda self: True)) + yield + + +def snooze_nonblocking(test_dir: str, max_delay: int = 20, post_data_delay: int = 2): + telmon_subdir = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + # let the non-blocking experiment complete. 
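+    # poll up to max_delay seconds for the telemetry subdir to appear, then
+    # wait post_data_delay more seconds so in-flight events can be written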
+ for _ in range(max_delay): + time.sleep(1) + if telmon_subdir.exists(): + time.sleep(post_data_delay) + break + + +@pytest.mark.parametrize( + ["cmd", "missing"], + [ + pytest.param("", {"-exp_dir", "-frequency"}, id="no args"), + pytest.param("-exp_dir /foo/bar", {"-frequency"}, id="no freq"), + pytest.param("-frequency 123", {"-exp_dir"}, id="no dir"), + ], +) +def test_parser_reqd_args(capsys, cmd, missing): + """Test that the parser reports any missing required arguments""" + parser = get_parser() + + args = cmd.split() + + captured = capsys.readouterr() # throw away existing output + with pytest.raises(SystemExit) as ex: + ns = parser.parse_args(args) + + captured = capsys.readouterr() + assert "the following arguments are required" in captured.err + err_desc = captured.err.split("the following arguments are required:")[-1] + for arg in missing: + assert arg in err_desc + + expected = ALL_ARGS - missing + for exp in expected: + assert exp not in err_desc + + +def test_parser(): + """Test that the parser succeeds when receiving expected args""" + parser = get_parser() + + test_dir = "/foo/bar" + test_freq = 123 + + cmd = f"-exp_dir {test_dir} -frequency {test_freq}" + args = cmd.split() + + ns = parser.parse_args(args) + + assert ns.exp_dir == test_dir + assert ns.frequency == test_freq + + +def test_ts(): + """Ensure expected output type""" + ts = get_ts() + assert isinstance(ts, int) + + +@pytest.mark.parametrize( + ["etype", "task_id", "step_id", "timestamp", "evt_type"], + [ + pytest.param("ensemble", "", "123", get_ts(), "start", id="start event"), + pytest.param("ensemble", "", "123", get_ts(), "stop", id="stop event"), + ], +) +def test_track_event( + etype: str, + task_id: str, + step_id: str, + timestamp: int, + evt_type: str, + fileutils, +): + """Ensure that track event writes a file to the expected location""" + exp_dir = fileutils.make_test_dir() + exp_path = pathlib.Path(exp_dir) + track_event(timestamp, task_id, step_id, etype, evt_type, exp_path, logger) + + expected_output = exp_path / f"{evt_type}.json" + + assert expected_output.exists() + assert expected_output.is_file() + + +def test_load_manifest(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly""" + sample_manifest_path = fileutils.get_test_conf_path("telemetry/telemetry.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + test_manifest_path = fileutils.make_test_file( + serialize.MANIFEST_FILENAME, + serialize.TELMON_SUBDIR, + sample_manifest.read_text(), + ) + test_manifest = pathlib.Path(test_manifest_path) + assert test_manifest.exists() + + manifest = load_manifest(test_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/path/to/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 6 + + assert len(manifest.runs[0].models) == 1 + assert len(manifest.runs[2].models) == 8 # 8 models in ensemble + assert len(manifest.runs[0].orchestrators) == 0 + assert len(manifest.runs[1].orchestrators) == 3 # 3 shards in db + + +def test_load_manifest_colo_model(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing a colocated model""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path("telemetry/colocatedmodel.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == 
"my-exp" + assert ( + str(manifest.path) + == "/tmp/my-exp" + ) + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 1 + + assert len(manifest.runs[0].models) == 1 + + +def test_load_manifest_serial_models(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing multiple models""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path("telemetry/serialmodels.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 1 + + assert len(manifest.runs[0].models) == 5 + + +def test_load_manifest_db_and_models(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing models & + orchestrator across 2 separate runs""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path("telemetry/db_and_model.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 2 + + assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[1].models) == 1 + + +def test_load_manifest_db_and_models_1run(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing models & + orchestrator in a single run""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path( + "telemetry/db_and_model_1run.json" + ) + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 1 + + assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[0].models) == 1 + + +@pytest.mark.parametrize( + ["task_id", "step_id", "etype", "exp_isorch", "exp_ismanaged"], + [ + pytest.param("123", "", "model", False, False, id="unmanaged, non-orch"), + pytest.param("456", "123", "ensemble", False, True, id="managed, non-orch"), + pytest.param("789", "987", "orchestrator", True, True, id="managed, orch"), + pytest.param("987", "", "orchestrator", True, False, id="unmanaged, orch"), + ], +) +def test_persistable_computed_properties( + task_id: str, step_id: str, etype: str, exp_isorch: bool, exp_ismanaged: bool +): + name = f"test-{etype}-{uuid.uuid4()}" + timestamp = get_ts() + exp_dir = pathlib.Path("/foo/bar") + stored = { + "name": name, + "run_id": timestamp, + "telemetry_metadata": { + "status_dir": str(exp_dir), + "task_id": task_id, + "step_id": step_id, + }, + } + persistables = hydrate_persistable(etype, stored, exp_dir) + persistable = persistables[0] if persistables else None + + assert persistable.is_managed == exp_ismanaged + assert persistable.is_db == exp_isorch + + +def test_deserialize_ensemble(fileutils: FileUtils): + """Ensure that the children of ensembles (models) are correctly + placed in the models collection""" + sample_manifest_path = fileutils.get_test_conf_path("telemetry/ensembles.json") 
+ sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest + + assert len(manifest.runs) == 1 + + # NOTE: no longer returning ensembles, only children... + # assert len(manifest.runs[0].ensembles) == 1 + assert len(manifest.runs[0].models) == 8 + + +def test_shutdown_conditions(): + """Ensure conditions to shutdown telemetry monitor are correctly evaluated""" + job_entity1 = JobEntity() + job_entity1.name = "xyz" + job_entity1.step_id = "123" + job_entity1.task_id = "" + + logger = logging.getLogger() + + # show that an event handler w/no monitored jobs can shutdown + mani_handler = ManifestEventHandler("xyz", logger) + assert can_shutdown(mani_handler, logger) + + # show that an event handler w/a monitored job cannot shutdown + mani_handler = ManifestEventHandler("xyz", logger) + mani_handler.job_manager.add_job( + job_entity1.name, job_entity1.step_id, job_entity1, False + ) + assert not can_shutdown(mani_handler, logger) + assert not bool(mani_handler.job_manager.db_jobs) + assert bool(mani_handler.job_manager.jobs) + + # show that an event handler w/a monitored db cannot shutdown + mani_handler = ManifestEventHandler("xyz", logger) + job_entity1.type = "orchestrator" + mani_handler.job_manager.add_job( + job_entity1.name, job_entity1.step_id, job_entity1, False + ) + assert not can_shutdown(mani_handler, logger) + assert bool(mani_handler.job_manager.db_jobs) + assert not bool(mani_handler.job_manager.jobs) + + # show that an event handler w/a dbs & tasks cannot shutdown + job_entity2 = JobEntity() + job_entity2.name = "xyz" + job_entity2.step_id = "123" + job_entity2.task_id = "" + + mani_handler = ManifestEventHandler("xyz", logger) + job_entity1.type = "orchestrator" + mani_handler.job_manager.add_job( + job_entity1.name, job_entity1.step_id, job_entity1, False + ) + + mani_handler.job_manager.add_job( + job_entity2.name, job_entity2.step_id, job_entity2, False + ) + assert not can_shutdown(mani_handler, logger) + assert bool(mani_handler.job_manager.db_jobs) + assert bool(mani_handler.job_manager.jobs) + + # ... now, show that removing 1 of 2 jobs still doesn't shutdown + mani_handler.job_manager.db_jobs.popitem() + assert not can_shutdown(mani_handler, logger) + + # ... 
now, show that removing the final job will allow shutdown
+    mani_handler.job_manager.jobs.popitem()
+    assert can_shutdown(mani_handler, logger)
+
+
+def test_auto_shutdown():
+    """Ensure that the cooldown timer is respected"""
+
+    class FauxObserver:
+        """Stand-in for the telemetry monitor's file-system observer"""
+        def __init__(self):
+            self.stop_count = 0
+
+        def stop(self):
+            self.stop_count += 1
+
+        def is_alive(self) -> bool:
+            if self.stop_count > 0:
+                return False
+
+            return True
+
+    job_entity1 = JobEntity()
+    job_entity1.name = "xyz"
+    job_entity1.step_id = "123"
+    job_entity1.task_id = ""
+
+    frequency = 1
+
+    # show that an event handler w/out a monitored task will automatically stop
+    mani_handler = ManifestEventHandler("xyz", logger)
+    observer = FauxObserver()
+    duration = 2
+
+    ts0 = get_ts()
+    event_loop(observer, mani_handler, frequency, logger, cooldown_duration=duration)
+    ts1 = get_ts()
+
+    assert ts1 - ts0 >= duration
+    assert observer.stop_count == 1
+
+    # show that the new cooldown duration is respected
+    mani_handler = ManifestEventHandler("xyz", logger)
+    observer = FauxObserver()
+    duration = 5
+
+    ts0 = get_ts()
+    event_loop(observer, mani_handler, frequency, logger, cooldown_duration=duration)
+    ts1 = get_ts()
+
+    assert ts1 - ts0 >= duration
+    assert observer.stop_count == 1
+
+
+def test_telemetry_single_model(fileutils, wlmutils):
+    """Ensure a single launched model produces exactly one start and one
+    stop telemetry event"""
+
+    # Set experiment name
+    exp_name = "telemetry_single_model"
+
+    # Retrieve parameters from testing environment
+    test_launcher = wlmutils.get_test_launcher()
+    test_dir = fileutils.make_test_dir()
+    test_script = fileutils.get_test_conf_path("echo.py")
+
+    # Create SmartSim Experiment
+    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
+
+    # create run settings
+    app_settings = exp.create_run_settings("python", test_script)
+    app_settings.set_nodes(1)
+    app_settings.set_tasks_per_node(1)
+
+    # Create the SmartSim Model
+    smartsim_model = exp.create_model("perroquet", app_settings)
+    exp.generate(smartsim_model)
+    exp.start(smartsim_model, block=True)
+    assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED
+
+    telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR
+    start_events = list(telemetry_output_path.rglob("start.json"))
+    stop_events = list(telemetry_output_path.rglob("stop.json"))
+
+    assert len(start_events) == 1
+    assert len(stop_events) == 1
+
+
+def test_telemetry_single_model_nonblocking(fileutils, wlmutils, monkeypatch):
+    """Ensure that the telemetry monitor logs exist when the experiment
+    is non-blocking"""
+    with monkeypatch.context() as ctx:
+        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
+
+        # Set experiment name
+        exp_name = "test_telemetry_single_model_nonblocking"
+
+        # Retrieve parameters from testing environment
+        test_launcher = wlmutils.get_test_launcher()
+        test_dir = fileutils.make_test_dir()
+        test_script = fileutils.get_test_conf_path("echo.py")
+
+        # Create SmartSim Experiment
+        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
+
+        # create run settings
+        app_settings = exp.create_run_settings("python", test_script)
+        app_settings.set_nodes(1)
+        app_settings.set_tasks_per_node(1)
+
+        # Create the SmartSim Model
+        smartsim_model = exp.create_model("perroquet", app_settings)
+        exp.generate(smartsim_model)
+        exp.start(smartsim_model)
+
+        snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30)
+
+        assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED
+
+
telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) == 1 + + +def test_telemetry_serial_models(fileutils, wlmutils, monkeypatch): + """ + Test telemetry with models being run in serial (one after each other) + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_serial_models" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_dir = fileutils.make_test_dir() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_models = [ + exp.create_model(f"perroquet_{i}", app_settings) for i in range(5) + ] + exp.generate(*smartsim_models) + exp.start(*smartsim_models, block=True) + assert all( + [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + ) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 5 + assert len(stop_events) == 5 + + +def test_telemetry_serial_models_nonblocking(fileutils, wlmutils, monkeypatch): + """ + Test telemetry with models being run in serial (one after each other) + in a non-blocking experiment + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_serial_models" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_dir = fileutils.make_test_dir() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_models = [ + exp.create_model(f"perroquet_{i}", app_settings) for i in range(5) + ] + exp.generate(*smartsim_models) + exp.start(*smartsim_models) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + + assert all( + [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + ) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 5 + assert len(stop_events) == 5 + + +def test_telemetry_db_only_with_generate(fileutils, wlmutils, monkeypatch): + """ + Test telemetry with only a database running + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_db_with_generate" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_dir = fileutils.make_test_dir() + + 
+        # Create SmartSim Experiment
+        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
+
+        # create regular database
+        orc = exp.create_database(port=test_port, interface=test_interface)
+        exp.generate(orc)
+        try:
+            exp.start(orc, block=True)
+
+            snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10)
+
+            telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR
+            start_events = list(telemetry_output_path.rglob("start.json"))
+            stop_events = list(telemetry_output_path.rglob("stop.json"))
+
+            assert len(start_events) == 1
+            assert len(stop_events) <= 1
+        finally:
+            exp.stop(orc)
+            snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10)
+
+        assert exp.get_status(orc)[0] == STATUS_CANCELLED
+
+        stop_events = list(telemetry_output_path.rglob("stop.json"))
+        assert len(stop_events) == 1
+
+
+def test_telemetry_db_only_without_generate(fileutils, wlmutils, monkeypatch):
+    """
+    Test telemetry with only a database running, without calling
+    exp.generate first
+    """
+    with monkeypatch.context() as ctx:
+        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
+
+        # Set experiment name
+        exp_name = "telemetry_db_only_without_generate"
+
+        # Retrieve parameters from testing environment
+        test_launcher = wlmutils.get_test_launcher()
+        test_interface = wlmutils.get_test_interface()
+        test_port = wlmutils.get_test_port()
+        test_dir = fileutils.make_test_dir()
+
+        # Create SmartSim Experiment
+        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
+
+        # create regular database
+        orc = exp.create_database(port=test_port, interface=test_interface)
+        try:
+            exp.start(orc)
+
+            snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30)
+
+            telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR
+            start_events = list(telemetry_output_path.rglob("start.json"))
+            stop_events = list(telemetry_output_path.rglob("stop.json"))
+
+            assert len(start_events) == 1
+            assert len(stop_events) == 0
+        finally:
+            exp.stop(orc)
+
+        snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10)
+        assert exp.get_status(orc)[0] == STATUS_CANCELLED
+
+        stop_events = list(telemetry_output_path.rglob("stop.json"))
+        assert len(stop_events) == 1
+
+
+def test_telemetry_db_and_model(fileutils, wlmutils, monkeypatch):
+    """
+    Test telemetry with a database and a model running
+    """
+
+    with monkeypatch.context() as ctx:
+        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
+
+        # Set experiment name
+        exp_name = "telemetry_db_and_model"
+
+        # Retrieve parameters from testing environment
+        test_launcher = wlmutils.get_test_launcher()
+        test_interface = wlmutils.get_test_interface()
+        test_port = wlmutils.get_test_port()
+        test_dir = fileutils.make_test_dir()
+        test_script = fileutils.get_test_conf_path("echo.py")
+
+        # Create SmartSim Experiment
+        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
+
+        # create regular database
+        orc = exp.create_database(port=test_port, interface=test_interface)
+        try:
+            exp.start(orc)
+
+            # create run settings
+            app_settings = exp.create_run_settings("python", test_script)
+            app_settings.set_nodes(1)
+            app_settings.set_tasks_per_node(1)
+
+            # Create the SmartSim Model
+            smartsim_model = exp.create_model("perroquet", app_settings)
+            exp.generate(smartsim_model)
+            exp.start(smartsim_model, block=True)
+        finally:
+            exp.stop(orc)
+
+        snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30)
+
+        assert exp.get_status(orc)[0] == STATUS_CANCELLED
+        assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED
+
+        telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR
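+        # events are written per entity type, so prefixing the rglob pattern
+        # with "database/" or "model/" isolates the events of a single entity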
+
+        start_events = list(telemetry_output_path.rglob("database/**/start.json"))
+        stop_events = list(telemetry_output_path.rglob("database/**/stop.json"))
+
+        assert len(start_events) == 1
+        assert len(stop_events) == 1
+
+        start_events = list(telemetry_output_path.rglob("model/**/start.json"))
+        stop_events = list(telemetry_output_path.rglob("model/**/stop.json"))
+        assert len(start_events) == 1
+        assert len(stop_events) == 1
+
+
+def test_telemetry_ensemble(fileutils, wlmutils, monkeypatch):
+    """
+    Test telemetry with an ensemble of models running
+    """
+
+    with monkeypatch.context() as ctx:
+        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
+
+        # Set experiment name
+        exp_name = "telemetry_ensemble"
+
+        # Retrieve parameters from testing environment
+        test_launcher = wlmutils.get_test_launcher()
+        test_dir = fileutils.make_test_dir()
+        test_script = fileutils.get_test_conf_path("echo.py")
+
+        # Create SmartSim Experiment
+        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
+
+        app_settings = exp.create_run_settings("python", test_script)
+        app_settings.set_nodes(1)
+        app_settings.set_tasks_per_node(1)
+
+        ens = exp.create_ensemble("troupeau", run_settings=app_settings, replicas=5)
+        exp.generate(ens)
+        exp.start(ens, block=True)
+        assert all(status == STATUS_COMPLETED for status in exp.get_status(ens))
+
+        snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30)
+        telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR
+        start_events = list(telemetry_output_path.rglob("start.json"))
+        stop_events = list(telemetry_output_path.rglob("stop.json"))
+
+        assert len(start_events) == 5
+        assert len(stop_events) == 5
+
+
+def test_telemetry_colo(fileutils, wlmutils, coloutils, monkeypatch):
+    """
+    Test telemetry with a colocated model running
+    """
+
+    with monkeypatch.context() as ctx:
+        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
+
+        # Set experiment name
+        exp_name = "telemetry_colo"
+
+        # Retrieve parameters from testing environment
+        test_launcher = wlmutils.get_test_launcher()
+        test_dir = fileutils.make_test_dir()
+
+        # Create SmartSim Experiment
+        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
+
+        smartsim_model = coloutils.setup_test_colo(
+            fileutils,
+            "uds",
+            exp,
+            "echo.py",
+            {},
+        )
+
+        exp.generate(smartsim_model)
+        exp.start(smartsim_model, block=True)
+        assert all(
+            status == STATUS_COMPLETED for status in exp.get_status(smartsim_model)
+        )
+
+        telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR
+        start_events = list(telemetry_output_path.rglob("start.json"))
+        stop_events = list(telemetry_output_path.rglob("stop.json"))
+
+        # the colocated db does NOT show up as a unique entity in the telemetry
+        assert len(start_events) == 1
+        assert len(stop_events) == 1
+
+
+@pytest.mark.parametrize(
+    "frequency, cooldown",
+    [
+        pytest.param(1, 1, id="1s shutdown"),
+        pytest.param(1, 5, id="5s shutdown"),
+        pytest.param(1, 15, id="15s shutdown"),
+    ],
+)
+def test_telemetry_autoshutdown(fileutils, wlmutils, monkeypatch, frequency, cooldown):
+    """
+    Ensure that the telemetry monitor process shuts down after the desired
+    cooldown period
+    """
+
+    with monkeypatch.context() as ctx:
+        ctx.setattr(cfg.Config, "telemetry_frequency", frequency)
+        ctx.setattr(cfg.Config, "telemetry_cooldown", cooldown)
+
+        # Set experiment name
+        exp_name = "telemetry_autoshutdown"
+
+        # Retrieve parameters from testing environment
+        test_launcher = wlmutils.get_test_launcher()
+        test_dir = fileutils.make_test_dir()
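+        # telemetry_cooldown is the grace period (in seconds) the monitor
+        # should linger after its last tracked job exits before shutting down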
+
+        # Create SmartSim Experiment
+        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
+
+        start_time = get_ts()
+        stop_time = start_time
+        exp.start(block=False)
+
+        telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR
+        empty_mani = list(telemetry_output_path.rglob("manifest.json"))
+        assert len(empty_mani) == 1, "a manifest.json should be created"
+
+        popen = exp._control._telemetry_monitor
+        assert popen.pid > 0
+        assert popen.returncode is None
+
+        # give some leeway during testing for the cooldown to get hit
+        for i in range(10):
+            if popen.poll() is not None:
+                stop_time = get_ts()
+                print(f"Completed polling for telemetry shutdown after {i} attempts")
+                break
+            time.sleep(3)
+
+        assert popen.returncode is not None
+        assert stop_time >= (start_time + cooldown)
+
+
+class MockStep(Step):
+    """Mock step implementing the abstract methods so that it can be
+    instantiated for test purposes
+    """
+
+    def get_launch_cmd(self):
+        return ["spam", "eggs"]
+
+
+@pytest.fixture
+def mock_step_meta_dict(fileutils):
+    test_dir = fileutils.make_test_dir()
+    telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR
+    yield {
+        "entity_type": "mock",
+        "status_dir": telemetry_output_path,
+    }
+
+
+@pytest.fixture
+def mock_step(fileutils, mock_step_meta_dict):
+    test_dir = fileutils.make_test_dir()
+    rs = RunSettings("echo")
+    step = MockStep("mock-step", test_dir, rs)
+    step.meta = mock_step_meta_dict
+    yield step
+
+
+def test_proxy_launch_cmd_decorator_reformats_cmds(mock_step, monkeypatch):
+    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True)
+    get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"])
+    cmd = get_launch_cmd(mock_step)
+    assert cmd != ["some", "cmd", "list"]
+    assert sys.executable in cmd
+    assert PROXY_ENTRY_POINT in cmd
+
+
+def test_proxy_launch_cmd_decorator_does_not_reformat_cmds_if_the_tm_is_off(
+    mock_step, monkeypatch
+):
+    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False)
+    get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"])
+    cmd = get_launch_cmd(mock_step)
+    assert cmd == ["some", "cmd", "list"]
+
+
+def test_proxy_launch_cmd_decorator_errors_if_attempt_to_proxy_a_managed_step(
+    mock_step, monkeypatch
+):
+    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True)
+    mock_step.managed = True
+    get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"])
+    with pytest.raises(UnproxyableStepError):
+        get_launch_cmd(mock_step)
+
+
+@for_all_wlm_launchers
+def test_unmanaged_steps_are_proxied_through_indirect(
+    wlm_launcher, mock_step_meta_dict, fileutils, monkeypatch
+):
+    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True)
+    test_dir = fileutils.make_test_dir()
+    rs = RunSettings("echo", ["hello", "world"])
+    step = wlm_launcher.create_step("test-step", test_dir, rs)
+    step.meta = mock_step_meta_dict
+    assert isinstance(step, Step)
+    assert not step.managed
+    cmd = step.get_launch_cmd()
+    assert sys.executable in cmd
+    assert PROXY_ENTRY_POINT in cmd
+    assert "hello" not in cmd
+    assert "world" not in cmd
+
+
+@for_all_wlm_launchers
+def test_unmanaged_steps_are_not_proxied_if_the_telemetry_monitor_is_disabled(
+    wlm_launcher, mock_step_meta_dict, fileutils, monkeypatch
+):
+    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False)
+    test_dir = fileutils.make_test_dir()
+    rs = RunSettings("echo", ["hello", "world"])
+    step = wlm_launcher.create_step("test-step", test_dir, rs)
+    step.meta = mock_step_meta_dict
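+    # with the telemetry monitor disabled, the original command should be
+    # returned unmodified: no interpreter or proxy entry point injected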
+    assert isinstance(step, Step)
+    assert not step.managed
+    cmd = step.get_launch_cmd()
+    assert PROXY_ENTRY_POINT not in cmd
+    assert "hello" in cmd
+    assert "world" in cmd
+
+
+@requires_wlm
+@pytest.mark.parametrize(
+    "run_command",
+    [
+        pytest.param("", id="Unmanaged"),
+        pytest.param("auto", id="Managed"),
+    ],
+)
+def test_multistart_experiment(
+    wlmutils: WLMUtils,
+    fileutils: FileUtils,
+    monkeypatch: pytest.MonkeyPatch,
+    run_command: str,
+):
+    """Run an experiment with multiple start calls to ensure that telemetry is
+    saved correctly for each run
+    """
+    test_dir = fileutils.make_test_dir(sub_dir=str(uuid.uuid4()))
+
+    exp_name = "my-exp"
+    exp = Experiment(
+        exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir
+    )
+    rs_e = exp.create_run_settings(
+        sys.executable, ["printing_model.py"], run_command=run_command
+    )
+    rs_e.set_nodes(1)
+    rs_e.set_tasks(1)
+    ens = exp.create_ensemble(
+        "my-ens",
+        run_settings=rs_e,
+        perm_strategy="all_perm",
+        params={
+            "START": ["spam"],
+            "MID": ["eggs"],
+            "END": ["sausage", "and spam"],
+        },
+    )
+
+    test_script_path = fileutils.get_test_conf_path("printing_model.py")
+    ens.attach_generator_files(to_configure=[test_script_path])
+
+    rs_m = exp.create_run_settings("echo", ["hello", "world"], run_command=run_command)
+    rs_m.set_nodes(1)
+    rs_m.set_tasks(1)
+    model = exp.create_model("my-model", run_settings=rs_m)
+
+    db = exp.create_database(
+        db_nodes=1,
+        port=wlmutils.get_test_port(),
+        interface=wlmutils.get_test_interface(),
+    )
+
+    exp.generate(db, ens, model, overwrite=True)
+
+    with monkeypatch.context() as ctx:
+        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
+        ctx.setattr(cfg.Config, "telemetry_cooldown", 45)
+
+        exp.start(model, block=False)
+
+        # track the PID to verify the telmon cooldown avoids restarting the process
+        tm_pid = exp._control._telemetry_monitor.pid
+
+        exp.start(db, block=False)
+        # check that the same TM process is still active
+        assert tm_pid == exp._control._telemetry_monitor.pid
+        try:
+            exp.start(ens, block=True, summary=True)
+        finally:
+            exp.stop(db)
+            assert tm_pid == exp._control._telemetry_monitor.pid
+            time.sleep(3)  # time for telmon to write the db stop event
+
+        telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR
+
+        db_start_events = list(telemetry_output_path.rglob("database/**/start.json"))
+        db_stop_events = list(telemetry_output_path.rglob("database/**/stop.json"))
+        assert len(db_start_events) == 1
+        assert len(db_stop_events) == 1
+
+        m_start_events = list(telemetry_output_path.rglob("model/**/start.json"))
+        m_stop_events = list(telemetry_output_path.rglob("model/**/stop.json"))
+        assert len(m_start_events) == 1
+        assert len(m_stop_events) == 1
+
+        e_start_events = list(telemetry_output_path.rglob("ensemble/**/start.json"))
+        e_stop_events = list(telemetry_output_path.rglob("ensemble/**/stop.json"))
+        assert len(e_start_events) == 2
+        assert len(e_stop_events) == 2
+
+
+@pytest.mark.parametrize(
+    "status_in, expected_out",
+    [
+        pytest.param(STATUS_CANCELLED, 1, id="failure on cancellation"),
+        pytest.param(STATUS_COMPLETED, 0, id="success on completion"),
+        pytest.param(STATUS_FAILED, 1, id="failure on failed"),
+        pytest.param(STATUS_NEW, None, id="no rc on new"),
+        pytest.param(STATUS_PAUSED, None, id="no rc on paused"),
+        pytest.param(STATUS_RUNNING, None, id="no rc on running"),
+    ],
+)
+def test_faux_rc(status_in: str, expected_out: t.Optional[int]):
+    """Ensure faux return codes match expectations."""
+    step_info = StepInfo(status=status_in)
+
+    rc = faux_return_code(step_info)
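+    # terminal statuses map to a synthetic exit code (0 or 1); non-terminal
+    # statuses yield None because no real return code exists yet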
+    assert rc == expected_out
+
+
+@pytest.mark.parametrize(
+    "status_in, expected_out, expected_has_jobs",
+    [
+        pytest.param(STATUS_CANCELLED, 1, False, id="failure on cancellation"),
+        pytest.param(STATUS_COMPLETED, 0, False, id="success on completion"),
+        pytest.param(STATUS_FAILED, 1, False, id="failure on failed"),
+        pytest.param(STATUS_NEW, None, True, id="no rc on new"),
+        pytest.param(STATUS_PAUSED, None, True, id="no rc on paused"),
+        pytest.param(STATUS_RUNNING, None, True, id="no rc on running"),
+    ],
+)
+def test_wlm_completion_handling(
+    fileutils: FileUtils,
+    monkeypatch: pytest.MonkeyPatch,
+    status_in: str,
+    expected_out: t.Optional[int],
+    expected_has_jobs: bool,
+):
+    test_dir = fileutils.make_test_dir(sub_dir=str(uuid.uuid4()))
+
+    def get_faux_update(status: str) -> t.Callable:
+        def _faux_updates(
+            _self: WLMLauncher, _names: t.List[str]
+        ) -> t.List[t.Tuple[str, StepInfo]]:
+            return [("faux-name", StepInfo(status=status))]
+
+        return _faux_updates
+
+    ts = get_ts()
+    with monkeypatch.context() as ctx:
+        # don't actually start a job manager
+        ctx.setattr(JobManager, "start", lambda x: ...)
+        ctx.setattr(SlurmLauncher, "get_step_update", get_faux_update(status_in))
+
+        mani_handler = ManifestEventHandler("xyz", logger)
+        mani_handler.set_launcher("slurm")
+
+        # prep a fake job to request updates for
+        job_entity = JobEntity()
+        job_entity.name = "faux-name"
+        job_entity.step_id = "faux-step-id"
+        job_entity.task_id = 1234
+        job_entity.status_dir = test_dir
+        job_entity.type = "orchestrator"
+
+        job = Job(job_entity.name, job_entity.step_id, job_entity, "slurm", True)
+
+        # populate our tracking collections
+        mani_handler._tracked_jobs = {job_entity.key: job_entity}
+        mani_handler.job_manager.jobs[job.name] = job
+
+        mani_handler.on_timestep(ts)
+
+        # see that the job queue was properly manipulated
+        has_jobs = bool(mani_handler._tracked_jobs)
+        assert expected_has_jobs == has_jobs
+
+        # see that the event was properly written
+        stop_event_path = pathlib.Path(test_dir) / "stop.json"
+
+        # if a status wasn't terminal, no stop event should have been written
+        should_have_stop_event = expected_out is not None
+        assert should_have_stop_event == stop_event_path.exists()
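+        # NOTE: on_timestep polls the launcher for updates on tracked jobs;
+        # when a job reaches a terminal status it is dropped from
+        # _tracked_jobs and a stop event is persisted to its status directory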