From ab69e0ced2ff93f5ae1498d20f508e866204d96f Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Wed, 16 Oct 2024 16:52:16 -0500 Subject: [PATCH 01/16] [skip ci] Remove use of dead dragon attrs --- .../_core/launcher/dragon/dragonBackend.py | 124 ++++++++---------- 1 file changed, 53 insertions(+), 71 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 4aba60d558..1338cb0e38 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -23,6 +23,7 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import collections import functools import itertools @@ -38,7 +39,6 @@ # isort: off import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy -import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine @@ -67,34 +67,48 @@ logger = get_logger(__name__) -class DragonStatus(str, Enum): - ERROR = str(dragon_group_state.Error()) - RUNNING = str(dragon_group_state.Running()) - - def __str__(self) -> str: - return self.value - - @dataclass class ProcessGroupInfo: status: SmartSimStatus """Status of step""" process_group: t.Optional[dragon_process_group.ProcessGroup] = None """Internal Process Group object, None for finished or not started steps""" - puids: t.Optional[t.List[t.Optional[int]]] = None # puids can be None - """List of Process UIDS belonging to the ProcessGroup""" - return_codes: t.Optional[t.List[int]] = None - """List of return codes of completed processes""" hosts: t.List[str] = field(default_factory=list) """List of hosts on which the Process Group """ 
redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None """Workers used to redirect stdout and stderr to file""" @property - def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.Optional[t.List[int]]]: + def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.List[int]]: """Information needed by SmartSim Launcher and Job Manager""" return (self.status, self.return_codes) + @property + def puids(self) -> t.List[int]: + """List of Process UIDS belonging to the ProcessGroup""" + return list(set(itertools.chain(self.active_puids, self.inactive_puids))) + + @property + def active_puids(self) -> t.List[int]: + if self.process_group is None: + return [] + return list(self.process_group.puids) + + @property + def inactive_puids(self) -> t.List[int]: + if self.process_group is None: + return [] + return [puid for puid, _ in self.process_group.inactive_puids] + + @property + def return_codes(self) -> t.List[int]: + """List of return codes of completed processes""" + if self.process_group is None: + return [-1] + if self.status == SmartSimStatus.STATUS_CANCELLED: + return [-9] + return [ret for _, ret in self.process_group.inactive_puids] + def __str__(self) -> str: if self.process_group is not None and self.redir_workers is not None: msg = [f"Active Group ({self.status})"] @@ -105,7 +119,7 @@ def __str__(self) -> str: if self.hosts is not None: msg.append(f"Hosts: {','.join(self.hosts)}") - if self.return_codes is not None: + if self.return_codes: msg.append(f"{self.return_codes}") return ", ".join(msg) @@ -404,10 +418,10 @@ def _stop_steps(self) -> None: else: # Technically we could just terminate, but what if # the application intercepts that and ignores it? 
- proc_group = self._group_infos[step_id].process_group + group_info = self._group_infos[step_id] if ( - proc_group is not None - and proc_group.status == DragonStatus.RUNNING + group_info.active_puids + and (proc_group := group_info.process_group) is not None ): try: proc_group.kill() @@ -416,7 +430,7 @@ def _stop_steps(self) -> None: proc_group.stop() except dragon_process_group.DragonProcessGroupError: logger.error("Process group already stopped") - redir_group = self._group_infos[step_id].redir_workers + redir_group = group_info.redir_workers if redir_group is not None: try: redir_group.join(0.1) @@ -425,7 +439,6 @@ def _stop_steps(self) -> None: logger.error(e) self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED - self._group_infos[step_id].return_codes = [-9] @staticmethod def create_run_policy( @@ -438,7 +451,6 @@ def create_run_policy( if isinstance(request, DragonRunRequest): run_request: DragonRunRequest = request - affinity = dragon_policy.Policy.Affinity.DEFAULT cpu_affinity: t.List[int] = [] gpu_affinity: t.List[int] = [] @@ -446,25 +458,20 @@ def create_run_policy( if run_request.policy is not None: # Affinities are not mutually exclusive. 
If specified, both are used if run_request.policy.cpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC cpu_affinity = run_request.policy.cpu_affinity if run_request.policy.gpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC gpu_affinity = run_request.policy.gpu_affinity logger.debug( - f"Affinity strategy: {affinity}, " f"CPU affinity mask: {cpu_affinity}, " f"GPU affinity mask: {gpu_affinity}" ) - if affinity != dragon_policy.Policy.Affinity.DEFAULT: - return dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=node_name, - affinity=affinity, - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) + return dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) return dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, @@ -513,22 +520,19 @@ def _start_steps(self) -> None: logger.error(e) grp_status = SmartSimStatus.STATUS_FAILED - puids = None try: - puids = list( - set(grp.puids + [puid for puid, retcode in grp.inactive_puids]) - ) - self._group_infos[step_id] = ProcessGroupInfo( + grp_info = ProcessGroupInfo( process_group=grp, - puids=puids, - return_codes=[], status=grp_status, hosts=hosts, ) + puids = grp_info.puids + self._group_infos[step_id] = grp_info self._running_steps.append(step_id) started.append(step_id) except Exception as e: logger.error(e) + puids = None if ( puids is not None @@ -575,32 +579,15 @@ def _refresh_statuses(self) -> None: grp = group_info.process_group if grp is None: group_info.status = SmartSimStatus.STATUS_FAILED - group_info.return_codes = [-1] elif group_info.status not in TERMINAL_STATUSES: - if grp.status == str(DragonStatus.RUNNING): + if group_info.active_puids: group_info.status = SmartSimStatus.STATUS_RUNNING - else: - puids = group_info.puids - if puids is not None and all( - puid is not None for puid in puids - ): - try: - 
group_info.return_codes = [ - dragon_process.Process(None, ident=puid).returncode - for puid in puids - ] - except (ValueError, TypeError) as e: - logger.error(e) - group_info.return_codes = [-1 for _ in puids] - else: - group_info.return_codes = [0] - if not group_info.status == SmartSimStatus.STATUS_CANCELLED: - group_info.status = ( - SmartSimStatus.STATUS_FAILED - if any(group_info.return_codes) - or grp.status == DragonStatus.ERROR - else SmartSimStatus.STATUS_COMPLETED - ) + elif group_info.status != SmartSimStatus.STATUS_CANCELLED: + group_info.status = ( + SmartSimStatus.STATUS_FAILED + if any(group_info.return_codes) + else SmartSimStatus.STATUS_COMPLETED + ) if group_info.status in TERMINAL_STATUSES: terminated.append(step_id) @@ -620,7 +607,7 @@ def _refresh_statuses(self) -> None: except KeyError: logger.error(f"Tried to free a non-allocated host: {host}") self._free_hosts.append(host) - group_info.process_group = None + # group_info.process_group = None group_info.redir_workers = None def _update_shutdown_status(self) -> None: @@ -685,7 +672,7 @@ def _(self, request: DragonRunRequest) -> DragonRunResponse: honorable, err = self._can_honor(request) if not honorable: self._group_infos[step_id] = ProcessGroupInfo( - status=SmartSimStatus.STATUS_FAILED, return_codes=[-1] + status=SmartSimStatus.STATUS_FAILED ) else: self._queued_steps[step_id] = request @@ -751,12 +738,7 @@ def _proc_group_info_table_line( else: table_line.append("") - if proc_group_info.return_codes is not None: - table_line.append( - f"{','.join(str(ret) for ret in proc_group_info.return_codes)}" - ) - else: - table_line.append("") + table_line.append(",".join(str(ret) for ret in proc_group_info.return_codes)) if proc_group_info.puids is not None: table_line.append(f"{len(proc_group_info.puids)}") From 4456a21992b55506ec424fc9e780c8d638874d05 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Fri, 18 Oct 2024 13:04:06 -0500 Subject: [PATCH 02/16] Pull the correct dragon version --- 
smartsim/_core/_cli/scripts/dragon_install.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 8028b8ecfd..65f80654e1 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -51,7 +51,7 @@ def python_version() -> str: def dragon_pin() -> str: """Return a string indicating the pinned major/minor version of the dragon package to install""" - return "0.9" + return "0.10" def _platform_filter(asset_name: str) -> bool: @@ -60,7 +60,7 @@ def _platform_filter(asset_name: str) -> bool: :param asset_name: A value to inspect for keywords indicating a Cray EX asset :returns: True if supplied value is correct for current platform""" - key = "crayex" + key = "hsn" is_cray = key in asset_name.lower() if is_crayex_platform(): return is_cray return not is_cray From de40bc9a0c72e003df9e13d1a26b772031992a1d Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Fri, 18 Oct 2024 14:11:12 -0500 Subject: [PATCH 03/16] Make CI happy --- doc/changelog.md | 7 +++++++ smartsim/_core/launcher/dragon/dragonBackend.py | 1 - 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/changelog.md b/doc/changelog.md index 8f93a1ae2c..18967708fc 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,11 +13,18 @@ To be released at some point in the future Description +- Update the `DragonBackend` to use + [Dragon V0.10](https://github.com/DragonHPC/dragon/releases/tag/v0.10-beta) - Implement workaround for Tensorflow that allows RedisAI to build with GCC-14 - Add instructions for installing SmartSim on PML's Scylla Detailed Notes +- Dragon V0.10 introduced support for infiniband networks and largely + overhauled the ``ProcessGroup`` API, used widely throughout SmartSim's + ``DragonBackend``, for better readability and debugging. SmartSim has + adopted this new version of Dragon to take advantage of these improvements. 
+ ([SmartSim-PR753](https://github.com/CrayLabs/SmartSim/pull/753)) - In libtensorflow, the input argument to TF_SessionRun seems to be mistyped to TF_Output instead of TF_Input. These two types differ only in name. GCC-14 catches this and throws an error, even though earlier versions allow this. To diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 1338cb0e38..078aa16c4a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -30,7 +30,6 @@ import time import typing as t from dataclasses import dataclass, field -from enum import Enum from threading import RLock from tabulate import tabulate From 890015723b81b5b04fedc5dae2aee5510c65d6d4 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Fri, 18 Oct 2024 15:04:53 -0500 Subject: [PATCH 04/16] Make tests happy --- smartsim/_core/_cli/scripts/dragon_install.py | 1 - tests/test_dragon_installer.py | 74 +++++++++++-------- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 65f80654e1..b3b1f6f982 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -132,7 +132,6 @@ def filter_assets(assets: t.Collection[GitReleaseAsset]) -> t.Optional[GitReleas def retrieve_asset_info() -> GitReleaseAsset: """Find a release asset that meets all necessary filtering criteria - :param dragon_pin: identify the dragon version to install (e.g. 
dragon-0.8) :returns: A GitHub release asset""" assets = _get_release_assets() asset = filter_assets(assets) diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index b23a1a7ef0..a2a72000f9 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import itertools import pathlib import sys import tarfile @@ -102,23 +103,25 @@ def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset] assets: t.List[GitReleaseAsset] = [] mock_archive_name_tpl = "{}-{}.4.1-{}ac132fe95.tar.gz" - for python_version in ["py3.9", "py3.10", "py3.11"]: - for dragon_version in ["dragon-0.8", "dragon-0.9", "dragon-0.10"]: - for platform in ["", "CRAYEX-"]: - - asset = GitReleaseAsset(requester, headers, attributes, completed) - - archive_name = mock_archive_name_tpl.format( - dragon_version, python_version, platform - ) - - monkeypatch.setattr( - asset, - "_browser_download_url", - _git_attr(value=f"http://foo/{archive_name}"), - ) - monkeypatch.setattr(asset, "_name", _git_attr(value=archive_name)) - assets.append(asset) + for python_version, dragon_version, platform in itertools.chain( + itertools.product( + ["py3.9", "py3.10", "py3.11"], ["dragon-0.8", "dragon-0.9"], ["", "CRAYEX-"] + ), + itertools.product( + ["py3.9", "py3.10", "py3.11"], ["dragon-0.10", "dragon-0.11"], ["", "HSN-"] + ), + ): + asset = GitReleaseAsset(requester, headers, attributes, completed) + archive_name = mock_archive_name_tpl.format( + dragon_version, python_version, platform + ) + monkeypatch.setattr( + asset, + "_browser_download_url", + _git_attr(value=f"http://foo/{archive_name}"), + ) + monkeypatch.setattr(asset, "_name", _git_attr(value=archive_name)) + assets.append(asset) return assets @@ -187,26 +190,39 @@ def test_retrieve_cached( @pytest.mark.parametrize( 
"dragon_pin,pyv,is_found,is_crayex", [ + # Dragon V0.8 pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8"), pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9"), pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10"), pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11"), pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12"), - pytest.param("0.8", "py3.8", False, True, id="0.8,python 3.8,CrayEX"), - pytest.param("0.8", "py3.9", True, True, id="0.8,python 3.9,CrayEX"), - pytest.param("0.8", "py3.10", True, True, id="0.8,python 3.10,CrayEX"), - pytest.param("0.8", "py3.11", True, True, id="0.8,python 3.11,CrayEX"), - pytest.param("0.8", "py3.12", False, True, id="0.8,python 3.12,CrayEX"), + pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8,CrayEX"), + pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9,CrayEX"), + pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10,CrayEX"), + pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11,CrayEX"), + pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12,CrayEX"), + # Dragon V0.9 pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8"), pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9"), pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10"), pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11"), pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12"), - pytest.param("0.9", "py3.8", False, True, id="0.9,python 3.8,CrayEX"), - pytest.param("0.9", "py3.9", True, True, id="0.9,python 3.9,CrayEX"), - pytest.param("0.9", "py3.10", True, True, id="0.9,python 3.10,CrayEX"), - pytest.param("0.9", "py3.11", True, True, id="0.9,python 3.11,CrayEX"), - pytest.param("0.9", "py3.12", False, True, id="0.9,python 3.12,CrayEX"), + pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8,CrayEX"), + pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9,CrayEX"), + 
pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10,CrayEX"), + pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11,CrayEX"), + pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12,CrayEX"), + # Dragon V0.10 + pytest.param("0.10", "py3.8", False, False, id="0.10,python 3.8"), + pytest.param("0.10", "py3.9", True, False, id="0.10,python 3.9"), + pytest.param("0.10", "py3.10", True, False, id="0.10,python 3.10"), + pytest.param("0.10", "py3.11", True, False, id="0.10,python 3.11"), + pytest.param("0.10", "py3.12", False, False, id="0.10,python 3.12"), + pytest.param("0.10", "py3.8", False, True, id="0.10,python 3.8,CrayEX"), + pytest.param("0.10", "py3.9", True, True, id="0.10,python 3.9,CrayEX"), + pytest.param("0.10", "py3.10", True, True, id="0.10,python 3.10,CrayEX"), + pytest.param("0.10", "py3.11", True, True, id="0.10,python 3.11,CrayEX"), + pytest.param("0.10", "py3.12", False, True, id="0.10,python 3.12,CrayEX"), # add a couple variants for a dragon version that isn't in the asset list pytest.param("0.7", "py3.9", False, False, id="0.7,python 3.9"), pytest.param("0.7", "py3.9", False, True, id="0.7,python 3.9,CrayEX"), @@ -254,9 +270,9 @@ def test_retrieve_asset_info( assert dragon_pin in chosen_asset.name if is_crayex: - assert "crayex" in chosen_asset.name.lower() + assert "hsn" in chosen_asset.name.lower() else: - assert "crayex" not in chosen_asset.name.lower() + assert "hsn" not in chosen_asset.name.lower() else: with pytest.raises(SmartSimCLIActionCancelled): retrieve_asset_info() From 11f3be281915445982053167c985aa34757372ca Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Fri, 25 Oct 2024 10:43:36 -0700 Subject: [PATCH 05/16] CrayEx -> HSN in tests --- tests/test_dragon_installer.py | 70 +++++++++++++++++----------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index a2a72000f9..33facb9560 100644 --- a/tests/test_dragon_installer.py 
+++ b/tests/test_dragon_installer.py @@ -51,7 +51,7 @@ pytestmark = pytest.mark.group_a -mock_archive_name = "dragon-0.8-py3.9.4.1-CRAYEX-ac132fe95.tar.gz" +mock_archive_name = "dragon-0.10-py3.9.4.1-HSN-ac132fe95.tar.gz" _git_attr = namedtuple("_git_attr", "value") @@ -188,7 +188,7 @@ def test_retrieve_cached( @pytest.mark.parametrize( - "dragon_pin,pyv,is_found,is_crayex", + "dragon_pin,pyv,is_found,is_hsn", [ # Dragon V0.8 pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8"), @@ -196,36 +196,36 @@ def test_retrieve_cached( pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10"), pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11"), pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12"), - pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8,CrayEX"), - pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9,CrayEX"), - pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10,CrayEX"), - pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11,CrayEX"), - pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12,CrayEX"), + pytest.param("0.8", "py3.8", False, True, id="0.8,python 3.8,HSN"), + pytest.param("0.8", "py3.9", False, True, id="0.8,python 3.9,HSN"), + pytest.param("0.8", "py3.10", False, True, id="0.8,python 3.10,HSN"), + pytest.param("0.8", "py3.11", False, True, id="0.8,python 3.11,HSN"), + pytest.param("0.8", "py3.12", False, True, id="0.8,python 3.12,HSN"), # Dragon V0.9 pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8"), pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9"), pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10"), pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11"), pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12"), - pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8,CrayEX"), - pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9,CrayEX"), - pytest.param("0.9", "py3.10", True, 
False, id="0.9,python 3.10,CrayEX"), - pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11,CrayEX"), - pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12,CrayEX"), + pytest.param("0.9", "py3.8", False, True, id="0.9,python 3.8,HSN"), + pytest.param("0.9", "py3.9", False, True, id="0.9,python 3.9,HSN"), + pytest.param("0.9", "py3.10", False, True, id="0.9,python 3.10,HSN"), + pytest.param("0.9", "py3.11", False, True, id="0.9,python 3.11,HSN"), + pytest.param("0.9", "py3.12", False, True, id="0.9,python 3.12,HSN"), # Dragon V0.10 pytest.param("0.10", "py3.8", False, False, id="0.10,python 3.8"), pytest.param("0.10", "py3.9", True, False, id="0.10,python 3.9"), pytest.param("0.10", "py3.10", True, False, id="0.10,python 3.10"), pytest.param("0.10", "py3.11", True, False, id="0.10,python 3.11"), pytest.param("0.10", "py3.12", False, False, id="0.10,python 3.12"), - pytest.param("0.10", "py3.8", False, True, id="0.10,python 3.8,CrayEX"), - pytest.param("0.10", "py3.9", True, True, id="0.10,python 3.9,CrayEX"), - pytest.param("0.10", "py3.10", True, True, id="0.10,python 3.10,CrayEX"), - pytest.param("0.10", "py3.11", True, True, id="0.10,python 3.11,CrayEX"), - pytest.param("0.10", "py3.12", False, True, id="0.10,python 3.12,CrayEX"), + pytest.param("0.10", "py3.8", False, True, id="0.10,python 3.8,HSN"), + pytest.param("0.10", "py3.9", True, True, id="0.10,python 3.9,HSN"), + pytest.param("0.10", "py3.10", True, True, id="0.10,python 3.10,HSN"), + pytest.param("0.10", "py3.11", True, True, id="0.10,python 3.11,HSN"), + pytest.param("0.10", "py3.12", False, True, id="0.10,python 3.12,HSN"), # add a couple variants for a dragon version that isn't in the asset list pytest.param("0.7", "py3.9", False, False, id="0.7,python 3.9"), - pytest.param("0.7", "py3.9", False, True, id="0.7,python 3.9,CrayEX"), + pytest.param("0.7", "py3.9", False, True, id="0.7,python 3.9,HSN"), ], ) def test_retrieve_asset_info( @@ -234,10 +234,10 @@ def 
test_retrieve_asset_info( dragon_pin: str, pyv: str, is_found: bool, - is_crayex: bool, + is_hsn: bool, ) -> None: """Verify that an information is retrieved correctly based on the python - version, platform (e.g. CrayEX, !CrayEx), and target dragon pin""" + version, platform (e.g. HSN, !HSN), and target dragon pin""" with monkeypatch.context() as ctx: ctx.setattr( @@ -248,7 +248,7 @@ def test_retrieve_asset_info( ctx.setattr( smartsim._core._cli.scripts.dragon_install, "is_crayex_platform", - lambda: is_crayex, + lambda: is_hsn, ) ctx.setattr( smartsim._core._cli.scripts.dragon_install, @@ -269,7 +269,7 @@ def test_retrieve_asset_info( assert pyv in chosen_asset.name assert dragon_pin in chosen_asset.name - if is_crayex: + if is_hsn: assert "hsn" in chosen_asset.name.lower() else: assert "hsn" not in chosen_asset.name.lower() @@ -293,8 +293,8 @@ def test_check_for_utility_exists() -> None: assert utility -def test_is_crayex_missing_ldconfig(monkeypatch: pytest.MonkeyPatch) -> None: - """Ensure the cray ex platform check doesn't fail when ldconfig isn't +def test_is_hsn_missing_ldconfig(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure the HSN platform check doesn't fail when ldconfig isn't available for use""" def mock_util_check(util: str) -> str: @@ -310,12 +310,11 @@ def mock_util_check(util: str) -> str: mock_util_check, ) - is_cray = helpers.is_crayex_platform() - assert not is_cray + assert not helpers.is_crayex_platform() -def test_is_crayex_missing_fi_info(monkeypatch: pytest.MonkeyPatch) -> None: - """Ensure the cray ex platform check doesn't fail when fi_info isn't +def test_is_hsn_missing_fi_info(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure the HSN platform check doesn't fail when fi_info isn't available for use""" def mock_util_check(util: str) -> str: @@ -331,14 +330,13 @@ def mock_util_check(util: str) -> str: mock_util_check, ) - is_cray = helpers.is_crayex_platform() - assert not is_cray + assert not helpers.is_crayex_platform() 
@pytest.mark.parametrize( - "is_cray,output,return_code", + "is_hsn,output,return_code", [ - pytest.param(True, "cray pmi2.so\ncxi\ncray pmi.so\npni.so", 0, id="CrayEX"), + pytest.param(True, "cray pmi2.so\ncxi\ncray pmi.so\npni.so", 0, id="Cray PMI"), pytest.param(False, "cray pmi2.so\ncxi\npni.so", 0, id="No PMI"), pytest.param(False, "cxi\ncray pmi.so\npni.so", 0, id="No PMI 2"), pytest.param(False, "cray pmi2.so\ncray pmi.so\npni.so", 0, id="No CXI"), @@ -346,10 +344,10 @@ def mock_util_check(util: str) -> str: pytest.param(False, "cray pmi.so\npmi2.so\ncxi", 0, id="Non Cray PMI2"), ], ) -def test_is_cray_ex( - monkeypatch: pytest.MonkeyPatch, is_cray: bool, output: str, return_code: int +def test_is_hsn( + monkeypatch: pytest.MonkeyPatch, is_hsn: bool, output: str, return_code: int ) -> None: - """Test that cray ex platform check result is returned as expected""" + """Test that HSN platform check result is returned as expected""" def mock_util_check(util: str) -> bool: # mock that we have the necessary tools @@ -370,7 +368,7 @@ def mock_util_check(util: str) -> bool: ) platform_result = helpers.is_crayex_platform() - assert is_cray == platform_result + assert is_hsn == platform_result def test_install_package_no_wheel(extraction_dir: pathlib.Path): From 98e43c1530383ff41c5bc3699eb9940d7b066bdd Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Fri, 25 Oct 2024 11:06:15 -0700 Subject: [PATCH 06/16] CrayEx -> HSN in library --- smartsim/_core/_cli/scripts/dragon_install.py | 6 ++--- smartsim/_core/utils/__init__.py | 2 +- smartsim/_core/utils/helpers.py | 27 ++++++++++--------- tests/test_dragon_installer.py | 8 +++--- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index b3b1f6f982..c67d9f767b 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -9,7 +9,7 @@ from smartsim._core._cli.utils import 
pip from smartsim._core._install.utils import retrieve from smartsim._core.config import CONFIG -from smartsim._core.utils.helpers import check_platform, is_crayex_platform +from smartsim._core.utils.helpers import check_platform, is_hsn_platform from smartsim.error.errors import SmartSimCLIActionCancelled from smartsim.log import get_logger @@ -62,7 +62,7 @@ def _platform_filter(asset_name: str) -> bool: :returns: True if supplied value is correct for current platform""" key = "hsn" is_cray = key in asset_name.lower() - if is_crayex_platform(): + if is_hsn_platform(): return is_cray return not is_cray @@ -137,7 +137,7 @@ def retrieve_asset_info() -> GitReleaseAsset: asset = filter_assets(assets) platform_result = check_platform() - if not platform_result.is_cray: + if not platform_result.is_hsn: logger.warning("Installing Dragon without HSTA support") for msg in platform_result.failures: logger.warning(msg) diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py index cddbc4ce98..bd163854e5 100644 --- a/smartsim/_core/utils/__init__.py +++ b/smartsim/_core/utils/__init__.py @@ -31,6 +31,6 @@ execute_platform_cmd, expand_exe_path, installed_redisai_backends, - is_crayex_platform, + is_hsn_platform, ) from .redis import check_cluster_status, create_cluster, db_is_active diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index b17be763b4..185d44bdb0 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -250,7 +250,6 @@ def installed_redisai_backends( :param backends_path: path containing backends :return: list of installed RedisAI backends """ - # import here to avoid circular import base_path = redis_install_base(backends_path) backends: t.Set[_TRedisAIBackendStr] = { "tensorflow", @@ -318,7 +317,7 @@ def execute_platform_cmd(cmd: str) -> t.Tuple[str, int]: return process.stdout.decode("utf-8"), process.returncode -class CrayExPlatformResult: +class HSNPlatformResult: locate_msg = 
"Unable to locate `{0}`." def __init__(self, ldconfig: t.Optional[str], fi_info: t.Optional[str]) -> None: @@ -337,7 +336,7 @@ def has_fi_info(self) -> bool: return bool(self.fi_info) @property - def is_cray(self) -> bool: + def is_hsn(self) -> bool: return all( ( self.has_ldconfig, @@ -370,11 +369,12 @@ def failures(self) -> t.List[str]: return failure_messages -def check_platform() -> CrayExPlatformResult: - """Returns True if the current platform is identified as Cray EX and - HSTA-aware dragon package can be installed, False otherwise. +def check_platform() -> HSNPlatformResult: + """Queries the platform for system libraries to determine if the platform + has a compatible high speed network and an HSTA-aware dragon package can be + utilized. - :returns: True if current platform is Cray EX, False otherwise""" + :returns: A populated platform result""" # ldconfig -p | grep cray | grep pmi.so && # ldconfig -p | grep cray | grep pmi2.so && @@ -383,7 +383,7 @@ def check_platform() -> CrayExPlatformResult: ldconfig = check_for_utility("ldconfig") fi_info = check_for_utility("fi_info") - result = CrayExPlatformResult(ldconfig, fi_info) + result = HSNPlatformResult(ldconfig, fi_info) if not all((result.has_ldconfig, result.has_fi_info)): return result @@ -403,13 +403,14 @@ def check_platform() -> CrayExPlatformResult: return result -def is_crayex_platform() -> bool: - """Returns True if the current platform is identified as Cray EX and - HSTA-aware dragon package can be installed, False otherwise. +def is_hsn_platform() -> bool: + """Returns True if the current platform is identified as having a high + speed network and HSTA-aware dragon package can be installed, False + otherwise. 
- :returns: True if current platform is Cray EX, False otherwise""" + :returns: True if current platform is HSN compatible, False otherwise""" result = check_platform() - return result.is_cray + return result.is_hsn @t.final diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index 33facb9560..e1bbc00448 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -247,7 +247,7 @@ def test_retrieve_asset_info( ) ctx.setattr( smartsim._core._cli.scripts.dragon_install, - "is_crayex_platform", + "is_hsn_platform", lambda: is_hsn, ) ctx.setattr( @@ -310,7 +310,7 @@ def mock_util_check(util: str) -> str: mock_util_check, ) - assert not helpers.is_crayex_platform() + assert not helpers.is_hsn_platform() def test_is_hsn_missing_fi_info(monkeypatch: pytest.MonkeyPatch) -> None: @@ -330,7 +330,7 @@ def mock_util_check(util: str) -> str: mock_util_check, ) - assert not helpers.is_crayex_platform() + assert not helpers.is_hsn_platform() @pytest.mark.parametrize( @@ -367,7 +367,7 @@ def mock_util_check(util: str) -> bool: lambda x: (output, return_code), ) - platform_result = helpers.is_crayex_platform() + platform_result = helpers.is_hsn_platform() assert is_hsn == platform_result From 0a0cccc0ffc67d4f83dff0dec96791ff501dc42d Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Mon, 28 Oct 2024 10:25:43 -0700 Subject: [PATCH 07/16] Docstrs --- .../_core/launcher/dragon/dragonBackend.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 078aa16c4a..37c6037b8a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -84,24 +84,38 @@ def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.List[int]]: @property def puids(self) -> t.List[int]: - """List of Process UIDS belonging to the ProcessGroup""" + """List of Process IDs belonging to the 
ProcessGroup. + + :returns: List of Process IDs belonging to the ProcessGroup. + """ return list(set(itertools.chain(self.active_puids, self.inactive_puids))) @property def active_puids(self) -> t.List[int]: + """List of process IDs that are running. + + :returns: List of process IDs that are running. + """ if self.process_group is None: return [] return list(self.process_group.puids) @property def inactive_puids(self) -> t.List[int]: + """List of process IDs that have completed. + + :returns: List of process IDs that have completed. + """ if self.process_group is None: return [] return [puid for puid, _ in self.process_group.inactive_puids] @property def return_codes(self) -> t.List[int]: - """List of return codes of completed processes""" + """List of return codes of completed processes. + + :returns: List of return codes of completed processes. + """ if self.process_group is None: return [-1] if self.status == SmartSimStatus.STATUS_CANCELLED: From 599b1f971b7dc94b320d15d2bae0d071adfcaeb6 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Mon, 28 Oct 2024 10:32:30 -0700 Subject: [PATCH 08/16] Special return codes enshrined as constants --- smartsim/_core/launcher/dragon/dragonBackend.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 37c6037b8a..8eee8b345c 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -66,6 +66,10 @@ logger = get_logger(__name__) +_RETURN_CODES_NO_PROCESS_GROUP: t.Final = [-1] +_RETURN_CODES_PROCESS_GROUP_CANCELLED: t.Final = [-9] + + @dataclass class ProcessGroupInfo: status: SmartSimStatus @@ -117,9 +121,9 @@ def return_codes(self) -> t.List[int]: :returns: List of return codes of completed processes. 
""" if self.process_group is None: - return [-1] + return _RETURN_CODES_NO_PROCESS_GROUP if self.status == SmartSimStatus.STATUS_CANCELLED: - return [-9] + return _RETURN_CODES_PROCESS_GROUP_CANCELLED return [ret for _, ret in self.process_group.inactive_puids] def __str__(self) -> str: From a2d9d6bb4d46b5d5209af45ace5a4ed2bc8119d6 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Mon, 28 Oct 2024 10:34:19 -0700 Subject: [PATCH 09/16] Release process group to match resource util behavior --- smartsim/_core/launcher/dragon/dragonBackend.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 8eee8b345c..f286eb56e0 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -80,6 +80,10 @@ class ProcessGroupInfo: """List of hosts on which the Process Group """ redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None """Workers used to redirect stdout and stderr to file""" + _final_return_codes: t.Optional[t.List[int]] = field(default=None, init=False) + """Field to cache final statuses when a process group info is marked as + completed so that the underlying process group can be released. + """ @property def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.List[int]]: @@ -120,6 +124,8 @@ def return_codes(self) -> t.List[int]: :returns: List of return codes of completed processes. """ + if self._final_return_codes is not None: + return self._final_return_codes if self.process_group is None: return _RETURN_CODES_NO_PROCESS_GROUP if self.status == SmartSimStatus.STATUS_CANCELLED: @@ -141,6 +147,14 @@ def __str__(self) -> str: return ", ".join(msg) + def mark_complete(self) -> None: + """Cached the final return codes and release any underlying dragon + process groups. 
+ """ + self._final_return_codes = self.return_codes + self.process_group = None + self.redir_workers = None + # Thanks to Colin Wahl from HPE HPC Dragon Team def redir_worker(io_conn: dragon_connection.Connection, file_path: str) -> None: @@ -624,8 +638,7 @@ def _refresh_statuses(self) -> None: except KeyError: logger.error(f"Tried to free a non-allocated host: {host}") self._free_hosts.append(host) - # group_info.process_group = None - group_info.redir_workers = None + group_info.mark_complete() def _update_shutdown_status(self) -> None: self._heartbeat() From 3a873f56b438529d6aa05fca3fc4052a48929705 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Mon, 28 Oct 2024 10:46:40 -0700 Subject: [PATCH 10/16] Typo --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index f286eb56e0..32c4d1c039 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -148,7 +148,7 @@ def __str__(self) -> str: return ", ".join(msg) def mark_complete(self) -> None: - """Cached the final return codes and release any underlying dragon + """Cache the final return codes and release any underlying dragon process groups. 
""" self._final_return_codes = self.return_codes From c8f6decf36f9c1a148c1ae8739e67abcc82e7d5c Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Wed, 30 Oct 2024 16:50:50 -0500 Subject: [PATCH 11/16] Prevent dragon processes from hanging Co-authored-by: Al Rigazzi --- smartsim/_core/launcher/dragon/dragonBackend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 32c4d1c039..9fea9ddfc4 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -152,7 +152,11 @@ def mark_complete(self) -> None: process groups. """ self._final_return_codes = self.return_codes + self.process_group.join() + self.process_group.close() self.process_group = None + self.redir_workers.join() + self.redir_workers.close() self.redir_workers = None From f40dc1d93e5c5d29a8d9b2118119be54c198183f Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Wed, 30 Oct 2024 17:18:05 -0500 Subject: [PATCH 12/16] Re-order for type check --- smartsim/_core/launcher/dragon/dragonBackend.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 9fea9ddfc4..5300f6d3a5 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -151,13 +151,15 @@ def mark_complete(self) -> None: """Cache the final return codes and release any underlying dragon process groups. 
""" - self._final_return_codes = self.return_codes - self.process_group.join() - self.process_group.close() - self.process_group = None - self.redir_workers.join() - self.redir_workers.close() - self.redir_workers = None + if self.process_group is not None: + self.process_group.join() + self._final_return_codes = self.return_codes + self.process_group.close() + self.process_group = None + if self.redir_workers is not None: + self.redir_workers.join() + self.redir_workers.close() + self.redir_workers = None # Thanks to Colin Wahl from HPE HPC Dragon Team From e557228cc60664019c06d83e78570ba84adf1fd8 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Wed, 30 Oct 2024 17:18:29 -0500 Subject: [PATCH 13/16] Use system status codes when cancelled --- smartsim/_core/launcher/dragon/dragonBackend.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 5300f6d3a5..5a7a28dd86 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -67,7 +67,6 @@ _RETURN_CODES_NO_PROCESS_GROUP: t.Final = [-1] -_RETURN_CODES_PROCESS_GROUP_CANCELLED: t.Final = [-9] @dataclass @@ -128,8 +127,6 @@ def return_codes(self) -> t.List[int]: return self._final_return_codes if self.process_group is None: return _RETURN_CODES_NO_PROCESS_GROUP - if self.status == SmartSimStatus.STATUS_CANCELLED: - return _RETURN_CODES_PROCESS_GROUP_CANCELLED return [ret for _, ret in self.process_group.inactive_puids] def __str__(self) -> str: From 60d8e73b7d0104d49dd40340f6faeda71d682cb5 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Wed, 30 Oct 2024 17:14:18 -0500 Subject: [PATCH 14/16] Remove `CRAYEX-` from mock assets --- tests/test_dragon_installer.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index e1bbc00448..9fa06be508 100644 --- 
a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -103,13 +103,8 @@ def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset] assets: t.List[GitReleaseAsset] = [] mock_archive_name_tpl = "{}-{}.4.1-{}ac132fe95.tar.gz" - for python_version, dragon_version, platform in itertools.chain( - itertools.product( - ["py3.9", "py3.10", "py3.11"], ["dragon-0.8", "dragon-0.9"], ["", "CRAYEX-"] - ), - itertools.product( - ["py3.9", "py3.10", "py3.11"], ["dragon-0.10", "dragon-0.11"], ["", "HSN-"] - ), + for python_version, dragon_version, platform in itertools.product( + ["py3.9", "py3.10", "py3.11"], ["dragon-0.10", "dragon-0.11"], ["", "HSN-"] ): asset = GitReleaseAsset(requester, headers, attributes, completed) archive_name = mock_archive_name_tpl.format( @@ -192,9 +187,9 @@ def test_retrieve_cached( [ # Dragon V0.8 pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8"), - pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9"), - pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10"), - pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11"), + pytest.param("0.8", "py3.9", False, False, id="0.8,python 3.9"), + pytest.param("0.8", "py3.10", False, False, id="0.8,python 3.10"), + pytest.param("0.8", "py3.11", False, False, id="0.8,python 3.11"), pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12"), pytest.param("0.8", "py3.8", False, True, id="0.8,python 3.8,HSN"), pytest.param("0.8", "py3.9", False, True, id="0.8,python 3.9,HSN"), @@ -203,9 +198,9 @@ def test_retrieve_cached( pytest.param("0.8", "py3.12", False, True, id="0.8,python 3.12,HSN"), # Dragon V0.9 pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8"), - pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9"), - pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10"), - pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11"), + pytest.param("0.9", "py3.9", False, False, id="0.9,python 
3.9"), + pytest.param("0.9", "py3.10", False, False, id="0.9,python 3.10"), + pytest.param("0.9", "py3.11", False, False, id="0.9,python 3.11"), pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12"), pytest.param("0.9", "py3.8", False, True, id="0.9,python 3.8,HSN"), pytest.param("0.9", "py3.9", False, True, id="0.9,python 3.9,HSN"), From 216b2dfd068608e33e7283f3669afcd40eb7e6eb Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Fri, 1 Nov 2024 15:23:00 -0500 Subject: [PATCH 15/16] Do not raise and stop backend on nonzero exit --- smartsim/_core/launcher/dragon/dragonBackend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 5a7a28dd86..971c60a6f2 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -409,7 +409,7 @@ def _create_redirect_workers( err_file: t.Optional[str], ) -> dragon_process_group.ProcessGroup: grp_redir = dragon_process_group.ProcessGroup( - restart=False, policy=global_policy, pmi_enabled=False + restart=False, ignore_error_on_exit=True, policy=global_policy, pmi_enabled=False ) for pol, puid in zip(policies, puids): proc = dragon_process.Process(None, ident=puid) @@ -528,7 +528,7 @@ def _start_steps(self) -> None: host_name=hosts[0], ) grp = dragon_process_group.ProcessGroup( - restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy + restart=False, ignore_error_on_exit=True, pmi_enabled=request.pmi_enabled, policy=global_policy ) policies = [] From eb7639b3b89a01b50f569fe892eaea00a729267b Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Fri, 1 Nov 2024 15:44:15 -0500 Subject: [PATCH 16/16] Style --- smartsim/_core/launcher/dragon/dragonBackend.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 
971c60a6f2..da59d9f24a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -409,7 +409,10 @@ def _create_redirect_workers( err_file: t.Optional[str], ) -> dragon_process_group.ProcessGroup: grp_redir = dragon_process_group.ProcessGroup( - restart=False, ignore_error_on_exit=True, policy=global_policy, pmi_enabled=False + restart=False, + ignore_error_on_exit=True, + policy=global_policy, + pmi_enabled=False, ) for pol, puid in zip(policies, puids): proc = dragon_process.Process(None, ident=puid) @@ -528,7 +531,10 @@ def _start_steps(self) -> None: host_name=hosts[0], ) grp = dragon_process_group.ProcessGroup( - restart=False, ignore_error_on_exit=True, pmi_enabled=request.pmi_enabled, policy=global_policy + restart=False, + ignore_error_on_exit=True, + pmi_enabled=request.pmi_enabled, + policy=global_policy, ) policies = []