From ab69e0ced2ff93f5ae1498d20f508e866204d96f Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Wed, 16 Oct 2024 16:52:16 -0500
Subject: [PATCH 01/16] [skip ci] Remove use of dead dragon attrs

---
 .../_core/launcher/dragon/dragonBackend.py    | 124 ++++++++----------
 1 file changed, 53 insertions(+), 71 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 4aba60d558..1338cb0e38 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -23,6 +23,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import collections
 import functools
 import itertools
@@ -38,7 +39,6 @@
 # isort: off
 import dragon.infrastructure.connection as dragon_connection
 import dragon.infrastructure.policy as dragon_policy
-import dragon.native.group_state as dragon_group_state
 import dragon.native.process as dragon_process
 import dragon.native.process_group as dragon_process_group
 import dragon.native.machine as dragon_machine
@@ -67,34 +67,48 @@
 logger = get_logger(__name__)
 
 
-class DragonStatus(str, Enum):
-    ERROR = str(dragon_group_state.Error())
-    RUNNING = str(dragon_group_state.Running())
-
-    def __str__(self) -> str:
-        return self.value
-
-
 @dataclass
 class ProcessGroupInfo:
     status: SmartSimStatus
     """Status of step"""
     process_group: t.Optional[dragon_process_group.ProcessGroup] = None
     """Internal Process Group object, None for finished or not started steps"""
-    puids: t.Optional[t.List[t.Optional[int]]] = None  # puids can be None
-    """List of Process UIDS belonging to the ProcessGroup"""
-    return_codes: t.Optional[t.List[int]] = None
-    """List of return codes of completed processes"""
     hosts: t.List[str] = field(default_factory=list)
     """List of hosts on which the Process Group """
     redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None
     """Workers used to redirect stdout and stderr to file"""
 
     @property
-    def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.Optional[t.List[int]]]:
+    def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.List[int]]:
         """Information needed by SmartSim Launcher and Job Manager"""
         return (self.status, self.return_codes)
 
+    @property
+    def puids(self) -> t.List[int]:
+        """List of Process UIDS belonging to the ProcessGroup"""
+        return list(set(itertools.chain(self.active_puids, self.inactive_puids)))
+
+    @property
+    def active_puids(self) -> t.List[int]:
+        if self.process_group is None:
+            return []
+        return list(self.process_group.puids)
+
+    @property
+    def inactive_puids(self) -> t.List[int]:
+        if self.process_group is None:
+            return []
+        return [puid for puid, _ in self.process_group.inactive_puids]
+
+    @property
+    def return_codes(self) -> t.List[int]:
+        """List of return codes of completed processes"""
+        if self.process_group is None:
+            return [-1]
+        if self.status == SmartSimStatus.STATUS_CANCELLED:
+            return [-9]
+        return [ret for _, ret in self.process_group.inactive_puids]
+
     def __str__(self) -> str:
         if self.process_group is not None and self.redir_workers is not None:
             msg = [f"Active Group ({self.status})"]
@@ -105,7 +119,7 @@ def __str__(self) -> str:
 
         if self.hosts is not None:
             msg.append(f"Hosts: {','.join(self.hosts)}")
-        if self.return_codes is not None:
+        if self.return_codes:
             msg.append(f"{self.return_codes}")
 
         return ", ".join(msg)
@@ -404,10 +418,10 @@ def _stop_steps(self) -> None:
                 else:
                     # Technically we could just terminate, but what if
                     # the application intercepts that and ignores it?
-                    proc_group = self._group_infos[step_id].process_group
+                    group_info = self._group_infos[step_id]
                     if (
-                        proc_group is not None
-                        and proc_group.status == DragonStatus.RUNNING
+                        group_info.active_puids
+                        and (proc_group := group_info.process_group) is not None
                     ):
                         try:
                             proc_group.kill()
@@ -416,7 +430,7 @@ def _stop_steps(self) -> None:
                                 proc_group.stop()
                             except dragon_process_group.DragonProcessGroupError:
                                 logger.error("Process group already stopped")
-                    redir_group = self._group_infos[step_id].redir_workers
+                    redir_group = group_info.redir_workers
                     if redir_group is not None:
                         try:
                             redir_group.join(0.1)
@@ -425,7 +439,6 @@ def _stop_steps(self) -> None:
                             logger.error(e)
 
                 self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED
-                self._group_infos[step_id].return_codes = [-9]
 
     @staticmethod
     def create_run_policy(
@@ -438,7 +451,6 @@ def create_run_policy(
         if isinstance(request, DragonRunRequest):
             run_request: DragonRunRequest = request
 
-            affinity = dragon_policy.Policy.Affinity.DEFAULT
             cpu_affinity: t.List[int] = []
             gpu_affinity: t.List[int] = []
 
@@ -446,25 +458,20 @@ def create_run_policy(
             if run_request.policy is not None:
                 # Affinities are not mutually exclusive. If specified, both are used
                 if run_request.policy.cpu_affinity:
-                    affinity = dragon_policy.Policy.Affinity.SPECIFIC
                     cpu_affinity = run_request.policy.cpu_affinity
 
                 if run_request.policy.gpu_affinity:
-                    affinity = dragon_policy.Policy.Affinity.SPECIFIC
                     gpu_affinity = run_request.policy.gpu_affinity
             logger.debug(
-                f"Affinity strategy: {affinity}, "
                 f"CPU affinity mask: {cpu_affinity}, "
                 f"GPU affinity mask: {gpu_affinity}"
             )
-            if affinity != dragon_policy.Policy.Affinity.DEFAULT:
-                return dragon_policy.Policy(
-                    placement=dragon_policy.Policy.Placement.HOST_NAME,
-                    host_name=node_name,
-                    affinity=affinity,
-                    cpu_affinity=cpu_affinity,
-                    gpu_affinity=gpu_affinity,
-                )
+            return dragon_policy.Policy(
+                placement=dragon_policy.Policy.Placement.HOST_NAME,
+                host_name=node_name,
+                cpu_affinity=cpu_affinity,
+                gpu_affinity=gpu_affinity,
+            )
 
         return dragon_policy.Policy(
             placement=dragon_policy.Policy.Placement.HOST_NAME,
@@ -513,22 +520,19 @@ def _start_steps(self) -> None:
                     logger.error(e)
                     grp_status = SmartSimStatus.STATUS_FAILED
 
-                puids = None
                 try:
-                    puids = list(
-                        set(grp.puids + [puid for puid, retcode in grp.inactive_puids])
-                    )
-                    self._group_infos[step_id] = ProcessGroupInfo(
+                    grp_info = ProcessGroupInfo(
                         process_group=grp,
-                        puids=puids,
-                        return_codes=[],
                         status=grp_status,
                         hosts=hosts,
                     )
+                    puids = grp_info.puids
+                    self._group_infos[step_id] = grp_info
                     self._running_steps.append(step_id)
                     started.append(step_id)
                 except Exception as e:
                     logger.error(e)
+                    puids = None
 
                 if (
                     puids is not None
@@ -575,32 +579,15 @@ def _refresh_statuses(self) -> None:
                 grp = group_info.process_group
                 if grp is None:
                     group_info.status = SmartSimStatus.STATUS_FAILED
-                    group_info.return_codes = [-1]
                 elif group_info.status not in TERMINAL_STATUSES:
-                    if grp.status == str(DragonStatus.RUNNING):
+                    if group_info.active_puids:
                         group_info.status = SmartSimStatus.STATUS_RUNNING
-                    else:
-                        puids = group_info.puids
-                        if puids is not None and all(
-                            puid is not None for puid in puids
-                        ):
-                            try:
-                                group_info.return_codes = [
-                                    dragon_process.Process(None, ident=puid).returncode
-                                    for puid in puids
-                                ]
-                            except (ValueError, TypeError) as e:
-                                logger.error(e)
-                                group_info.return_codes = [-1 for _ in puids]
-                        else:
-                            group_info.return_codes = [0]
-                        if not group_info.status == SmartSimStatus.STATUS_CANCELLED:
-                            group_info.status = (
-                                SmartSimStatus.STATUS_FAILED
-                                if any(group_info.return_codes)
-                                or grp.status == DragonStatus.ERROR
-                                else SmartSimStatus.STATUS_COMPLETED
-                            )
+                    elif group_info.status != SmartSimStatus.STATUS_CANCELLED:
+                        group_info.status = (
+                            SmartSimStatus.STATUS_FAILED
+                            if any(group_info.return_codes)
+                            else SmartSimStatus.STATUS_COMPLETED
+                        )
 
                 if group_info.status in TERMINAL_STATUSES:
                     terminated.append(step_id)
@@ -620,7 +607,7 @@ def _refresh_statuses(self) -> None:
                         except KeyError:
                             logger.error(f"Tried to free a non-allocated host: {host}")
                         self._free_hosts.append(host)
-                    group_info.process_group = None
+                    # group_info.process_group = None
                     group_info.redir_workers = None
 
     def _update_shutdown_status(self) -> None:
@@ -685,7 +672,7 @@ def _(self, request: DragonRunRequest) -> DragonRunResponse:
             honorable, err = self._can_honor(request)
             if not honorable:
                 self._group_infos[step_id] = ProcessGroupInfo(
-                    status=SmartSimStatus.STATUS_FAILED, return_codes=[-1]
+                    status=SmartSimStatus.STATUS_FAILED
                 )
             else:
                 self._queued_steps[step_id] = request
@@ -751,12 +738,7 @@ def _proc_group_info_table_line(
         else:
             table_line.append("")
 
-        if proc_group_info.return_codes is not None:
-            table_line.append(
-                f"{','.join(str(ret) for ret in proc_group_info.return_codes)}"
-            )
-        else:
-            table_line.append("")
+        table_line.append(",".join(str(ret) for ret in proc_group_info.return_codes))
 
         if proc_group_info.puids is not None:
             table_line.append(f"{len(proc_group_info.puids)}")

From 4456a21992b55506ec424fc9e780c8d638874d05 Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Fri, 18 Oct 2024 13:04:06 -0500
Subject: [PATCH 02/16] Pull the correct dragon version

---
 smartsim/_core/_cli/scripts/dragon_install.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py
index 8028b8ecfd..65f80654e1 100644
--- a/smartsim/_core/_cli/scripts/dragon_install.py
+++ b/smartsim/_core/_cli/scripts/dragon_install.py
@@ -51,7 +51,7 @@ def python_version() -> str:
 def dragon_pin() -> str:
     """Return a string indicating the pinned major/minor version of the dragon
     package to install"""
-    return "0.9"
+    return "0.10"
 
 
 def _platform_filter(asset_name: str) -> bool:
@@ -60,7 +60,7 @@ def _platform_filter(asset_name: str) -> bool:
 
     :param asset_name: A value to inspect for keywords indicating a Cray EX asset
     :returns: True if supplied value is correct for current platform"""
-    key = "crayex"
+    key = "hsn"
     is_cray = key in asset_name.lower()
     if is_crayex_platform():
         return is_cray

From de40bc9a0c72e003df9e13d1a26b772031992a1d Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Fri, 18 Oct 2024 14:11:12 -0500
Subject: [PATCH 03/16] Make CI happy

---
 doc/changelog.md                                | 7 +++++++
 smartsim/_core/launcher/dragon/dragonBackend.py | 1 -
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/doc/changelog.md b/doc/changelog.md
index 8f93a1ae2c..18967708fc 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -13,11 +13,18 @@ To be released at some point in the future
 
 Description
 
+- Update the `DragonBackend` to use
+  [Dragon V0.10](https://github.com/DragonHPC/dragon/releases/tag/v0.10-beta)
 - Implement workaround for Tensorflow that allows RedisAI to build with GCC-14
 - Add instructions for installing SmartSim on PML's Scylla
 
 Detailed Notes
 
+- Dragon V0.10 introduced support for infiniband networks and largely
+  overhauled the ``ProcessGroup`` API, used widely throughout SmartSim's
+  ``DragonBackend``, for better readability and debugging.  SmartSim has
+  adopted this new version of Dragon to take advantage of these improvements.
+  ([SmartSim-PR753](https://github.com/CrayLabs/SmartSim/pull/753))
 - In libtensorflow, the input argument to TF_SessionRun seems to be mistyped to
   TF_Output instead of TF_Input. These two types differ only in name. GCC-14
   catches this and throws an error, even though earlier versions allow this. To
diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 1338cb0e38..078aa16c4a 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -30,7 +30,6 @@
 import time
 import typing as t
 from dataclasses import dataclass, field
-from enum import Enum
 from threading import RLock
 
 from tabulate import tabulate

From 890015723b81b5b04fedc5dae2aee5510c65d6d4 Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Fri, 18 Oct 2024 15:04:53 -0500
Subject: [PATCH 04/16] Make tests happy

---
 smartsim/_core/_cli/scripts/dragon_install.py |  1 -
 tests/test_dragon_installer.py                | 74 +++++++++++--------
 2 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py
index 65f80654e1..b3b1f6f982 100644
--- a/smartsim/_core/_cli/scripts/dragon_install.py
+++ b/smartsim/_core/_cli/scripts/dragon_install.py
@@ -132,7 +132,6 @@ def filter_assets(assets: t.Collection[GitReleaseAsset]) -> t.Optional[GitReleas
 def retrieve_asset_info() -> GitReleaseAsset:
     """Find a release asset that meets all necessary filtering criteria
 
-    :param dragon_pin: identify the dragon version to install (e.g. dragon-0.8)
     :returns: A GitHub release asset"""
     assets = _get_release_assets()
     asset = filter_assets(assets)
diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py
index b23a1a7ef0..a2a72000f9 100644
--- a/tests/test_dragon_installer.py
+++ b/tests/test_dragon_installer.py
@@ -24,6 +24,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import itertools
 import pathlib
 import sys
 import tarfile
@@ -102,23 +103,25 @@ def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset]
     assets: t.List[GitReleaseAsset] = []
     mock_archive_name_tpl = "{}-{}.4.1-{}ac132fe95.tar.gz"
 
-    for python_version in ["py3.9", "py3.10", "py3.11"]:
-        for dragon_version in ["dragon-0.8", "dragon-0.9", "dragon-0.10"]:
-            for platform in ["", "CRAYEX-"]:
-
-                asset = GitReleaseAsset(requester, headers, attributes, completed)
-
-                archive_name = mock_archive_name_tpl.format(
-                    dragon_version, python_version, platform
-                )
-
-                monkeypatch.setattr(
-                    asset,
-                    "_browser_download_url",
-                    _git_attr(value=f"http://foo/{archive_name}"),
-                )
-                monkeypatch.setattr(asset, "_name", _git_attr(value=archive_name))
-                assets.append(asset)
+    for python_version, dragon_version, platform in itertools.chain(
+        itertools.product(
+            ["py3.9", "py3.10", "py3.11"], ["dragon-0.8", "dragon-0.9"], ["", "CRAYEX-"]
+        ),
+        itertools.product(
+            ["py3.9", "py3.10", "py3.11"], ["dragon-0.10", "dragon-0.11"], ["", "HSN-"]
+        ),
+    ):
+        asset = GitReleaseAsset(requester, headers, attributes, completed)
+        archive_name = mock_archive_name_tpl.format(
+            dragon_version, python_version, platform
+        )
+        monkeypatch.setattr(
+            asset,
+            "_browser_download_url",
+            _git_attr(value=f"http://foo/{archive_name}"),
+        )
+        monkeypatch.setattr(asset, "_name", _git_attr(value=archive_name))
+        assets.append(asset)
 
     return assets
 
@@ -187,26 +190,39 @@ def test_retrieve_cached(
 @pytest.mark.parametrize(
     "dragon_pin,pyv,is_found,is_crayex",
     [
+        # Dragon V0.8
         pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8"),
         pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9"),
         pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10"),
         pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11"),
         pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12"),
-        pytest.param("0.8", "py3.8", False, True, id="0.8,python 3.8,CrayEX"),
-        pytest.param("0.8", "py3.9", True, True, id="0.8,python 3.9,CrayEX"),
-        pytest.param("0.8", "py3.10", True, True, id="0.8,python 3.10,CrayEX"),
-        pytest.param("0.8", "py3.11", True, True, id="0.8,python 3.11,CrayEX"),
-        pytest.param("0.8", "py3.12", False, True, id="0.8,python 3.12,CrayEX"),
+        pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8,CrayEX"),
+        pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9,CrayEX"),
+        pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10,CrayEX"),
+        pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11,CrayEX"),
+        pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12,CrayEX"),
+        # Dragon V0.9
         pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8"),
         pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9"),
         pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10"),
         pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11"),
         pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12"),
-        pytest.param("0.9", "py3.8", False, True, id="0.9,python 3.8,CrayEX"),
-        pytest.param("0.9", "py3.9", True, True, id="0.9,python 3.9,CrayEX"),
-        pytest.param("0.9", "py3.10", True, True, id="0.9,python 3.10,CrayEX"),
-        pytest.param("0.9", "py3.11", True, True, id="0.9,python 3.11,CrayEX"),
-        pytest.param("0.9", "py3.12", False, True, id="0.9,python 3.12,CrayEX"),
+        pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8,CrayEX"),
+        pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9,CrayEX"),
+        pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10,CrayEX"),
+        pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11,CrayEX"),
+        pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12,CrayEX"),
+        # Dragon V0.10
+        pytest.param("0.10", "py3.8", False, False, id="0.10,python 3.8"),
+        pytest.param("0.10", "py3.9", True, False, id="0.10,python 3.9"),
+        pytest.param("0.10", "py3.10", True, False, id="0.10,python 3.10"),
+        pytest.param("0.10", "py3.11", True, False, id="0.10,python 3.11"),
+        pytest.param("0.10", "py3.12", False, False, id="0.10,python 3.12"),
+        pytest.param("0.10", "py3.8", False, True, id="0.10,python 3.8,CrayEX"),
+        pytest.param("0.10", "py3.9", True, True, id="0.10,python 3.9,CrayEX"),
+        pytest.param("0.10", "py3.10", True, True, id="0.10,python 3.10,CrayEX"),
+        pytest.param("0.10", "py3.11", True, True, id="0.10,python 3.11,CrayEX"),
+        pytest.param("0.10", "py3.12", False, True, id="0.10,python 3.12,CrayEX"),
         # add a couple variants for a dragon version that isn't in the asset list
         pytest.param("0.7", "py3.9", False, False, id="0.7,python 3.9"),
         pytest.param("0.7", "py3.9", False, True, id="0.7,python 3.9,CrayEX"),
@@ -254,9 +270,9 @@ def test_retrieve_asset_info(
             assert dragon_pin in chosen_asset.name
 
             if is_crayex:
-                assert "crayex" in chosen_asset.name.lower()
+                assert "hsn" in chosen_asset.name.lower()
             else:
-                assert "crayex" not in chosen_asset.name.lower()
+                assert "hsn" not in chosen_asset.name.lower()
         else:
             with pytest.raises(SmartSimCLIActionCancelled):
                 retrieve_asset_info()

From 11f3be281915445982053167c985aa34757372ca Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Fri, 25 Oct 2024 10:43:36 -0700
Subject: [PATCH 05/16] CrayEx -> HSN in tests

---
 tests/test_dragon_installer.py | 70 +++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 36 deletions(-)

diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py
index a2a72000f9..33facb9560 100644
--- a/tests/test_dragon_installer.py
+++ b/tests/test_dragon_installer.py
@@ -51,7 +51,7 @@
 pytestmark = pytest.mark.group_a
 
 
-mock_archive_name = "dragon-0.8-py3.9.4.1-CRAYEX-ac132fe95.tar.gz"
+mock_archive_name = "dragon-0.10-py3.9.4.1-HSN-ac132fe95.tar.gz"
 _git_attr = namedtuple("_git_attr", "value")
 
 
@@ -188,7 +188,7 @@ def test_retrieve_cached(
 
 
 @pytest.mark.parametrize(
-    "dragon_pin,pyv,is_found,is_crayex",
+    "dragon_pin,pyv,is_found,is_hsn",
     [
         # Dragon V0.8
         pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8"),
@@ -196,36 +196,36 @@ def test_retrieve_cached(
         pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10"),
         pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11"),
         pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12"),
-        pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8,CrayEX"),
-        pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9,CrayEX"),
-        pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10,CrayEX"),
-        pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11,CrayEX"),
-        pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12,CrayEX"),
+        pytest.param("0.8", "py3.8", False, True, id="0.8,python 3.8,HSN"),
+        pytest.param("0.8", "py3.9", False, True, id="0.8,python 3.9,HSN"),
+        pytest.param("0.8", "py3.10", False, True, id="0.8,python 3.10,HSN"),
+        pytest.param("0.8", "py3.11", False, True, id="0.8,python 3.11,HSN"),
+        pytest.param("0.8", "py3.12", False, True, id="0.8,python 3.12,HSN"),
         # Dragon V0.9
         pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8"),
         pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9"),
         pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10"),
         pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11"),
         pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12"),
-        pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8,CrayEX"),
-        pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9,CrayEX"),
-        pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10,CrayEX"),
-        pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11,CrayEX"),
-        pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12,CrayEX"),
+        pytest.param("0.9", "py3.8", False, True, id="0.9,python 3.8,HSN"),
+        pytest.param("0.9", "py3.9", False, True, id="0.9,python 3.9,HSN"),
+        pytest.param("0.9", "py3.10", False, True, id="0.9,python 3.10,HSN"),
+        pytest.param("0.9", "py3.11", False, True, id="0.9,python 3.11,HSN"),
+        pytest.param("0.9", "py3.12", False, True, id="0.9,python 3.12,HSN"),
         # Dragon V0.10
         pytest.param("0.10", "py3.8", False, False, id="0.10,python 3.8"),
         pytest.param("0.10", "py3.9", True, False, id="0.10,python 3.9"),
         pytest.param("0.10", "py3.10", True, False, id="0.10,python 3.10"),
         pytest.param("0.10", "py3.11", True, False, id="0.10,python 3.11"),
         pytest.param("0.10", "py3.12", False, False, id="0.10,python 3.12"),
-        pytest.param("0.10", "py3.8", False, True, id="0.10,python 3.8,CrayEX"),
-        pytest.param("0.10", "py3.9", True, True, id="0.10,python 3.9,CrayEX"),
-        pytest.param("0.10", "py3.10", True, True, id="0.10,python 3.10,CrayEX"),
-        pytest.param("0.10", "py3.11", True, True, id="0.10,python 3.11,CrayEX"),
-        pytest.param("0.10", "py3.12", False, True, id="0.10,python 3.12,CrayEX"),
+        pytest.param("0.10", "py3.8", False, True, id="0.10,python 3.8,HSN"),
+        pytest.param("0.10", "py3.9", True, True, id="0.10,python 3.9,HSN"),
+        pytest.param("0.10", "py3.10", True, True, id="0.10,python 3.10,HSN"),
+        pytest.param("0.10", "py3.11", True, True, id="0.10,python 3.11,HSN"),
+        pytest.param("0.10", "py3.12", False, True, id="0.10,python 3.12,HSN"),
         # add a couple variants for a dragon version that isn't in the asset list
         pytest.param("0.7", "py3.9", False, False, id="0.7,python 3.9"),
-        pytest.param("0.7", "py3.9", False, True, id="0.7,python 3.9,CrayEX"),
+        pytest.param("0.7", "py3.9", False, True, id="0.7,python 3.9,HSN"),
     ],
 )
 def test_retrieve_asset_info(
@@ -234,10 +234,10 @@ def test_retrieve_asset_info(
     dragon_pin: str,
     pyv: str,
     is_found: bool,
-    is_crayex: bool,
+    is_hsn: bool,
 ) -> None:
     """Verify that an information is retrieved correctly based on the python
-    version, platform (e.g. CrayEX, !CrayEx), and target dragon pin"""
+    version, platform (e.g. HSN, !HSN), and target dragon pin"""
 
     with monkeypatch.context() as ctx:
         ctx.setattr(
@@ -248,7 +248,7 @@ def test_retrieve_asset_info(
         ctx.setattr(
             smartsim._core._cli.scripts.dragon_install,
             "is_crayex_platform",
-            lambda: is_crayex,
+            lambda: is_hsn,
         )
         ctx.setattr(
             smartsim._core._cli.scripts.dragon_install,
@@ -269,7 +269,7 @@ def test_retrieve_asset_info(
             assert pyv in chosen_asset.name
             assert dragon_pin in chosen_asset.name
 
-            if is_crayex:
+            if is_hsn:
                 assert "hsn" in chosen_asset.name.lower()
             else:
                 assert "hsn" not in chosen_asset.name.lower()
@@ -293,8 +293,8 @@ def test_check_for_utility_exists() -> None:
     assert utility
 
 
-def test_is_crayex_missing_ldconfig(monkeypatch: pytest.MonkeyPatch) -> None:
-    """Ensure the cray ex platform check doesn't fail when ldconfig isn't
+def test_is_hsn_missing_ldconfig(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Ensure the HSN platform check doesn't fail when ldconfig isn't
     available for use"""
 
     def mock_util_check(util: str) -> str:
@@ -310,12 +310,11 @@ def mock_util_check(util: str) -> str:
             mock_util_check,
         )
 
-        is_cray = helpers.is_crayex_platform()
-        assert not is_cray
+        assert not helpers.is_crayex_platform()
 
 
-def test_is_crayex_missing_fi_info(monkeypatch: pytest.MonkeyPatch) -> None:
-    """Ensure the cray ex platform check doesn't fail when fi_info isn't
+def test_is_hsn_missing_fi_info(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Ensure the HSN platform check doesn't fail when fi_info isn't
     available for use"""
 
     def mock_util_check(util: str) -> str:
@@ -331,14 +330,13 @@ def mock_util_check(util: str) -> str:
             mock_util_check,
         )
 
-        is_cray = helpers.is_crayex_platform()
-        assert not is_cray
+        assert not helpers.is_crayex_platform()
 
 
 @pytest.mark.parametrize(
-    "is_cray,output,return_code",
+    "is_hsn,output,return_code",
     [
-        pytest.param(True, "cray pmi2.so\ncxi\ncray pmi.so\npni.so", 0, id="CrayEX"),
+        pytest.param(True, "cray pmi2.so\ncxi\ncray pmi.so\npni.so", 0, id="Cray PMI"),
         pytest.param(False, "cray pmi2.so\ncxi\npni.so", 0, id="No PMI"),
         pytest.param(False, "cxi\ncray pmi.so\npni.so", 0, id="No PMI 2"),
         pytest.param(False, "cray pmi2.so\ncray pmi.so\npni.so", 0, id="No CXI"),
@@ -346,10 +344,10 @@ def mock_util_check(util: str) -> str:
         pytest.param(False, "cray pmi.so\npmi2.so\ncxi", 0, id="Non Cray PMI2"),
     ],
 )
-def test_is_cray_ex(
-    monkeypatch: pytest.MonkeyPatch, is_cray: bool, output: str, return_code: int
+def test_is_hsn(
+    monkeypatch: pytest.MonkeyPatch, is_hsn: bool, output: str, return_code: int
 ) -> None:
-    """Test that cray ex platform check result is returned as expected"""
+    """Test that HSN platform check result is returned as expected"""
 
     def mock_util_check(util: str) -> bool:
         # mock that we have the necessary tools
@@ -370,7 +368,7 @@ def mock_util_check(util: str) -> bool:
         )
 
         platform_result = helpers.is_crayex_platform()
-        assert is_cray == platform_result
+        assert is_hsn == platform_result
 
 
 def test_install_package_no_wheel(extraction_dir: pathlib.Path):

From 98e43c1530383ff41c5bc3699eb9940d7b066bdd Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Fri, 25 Oct 2024 11:06:15 -0700
Subject: [PATCH 06/16] CrayEx -> HSN in library

---
 smartsim/_core/_cli/scripts/dragon_install.py |  6 ++---
 smartsim/_core/utils/__init__.py              |  2 +-
 smartsim/_core/utils/helpers.py               | 27 ++++++++++---------
 tests/test_dragon_installer.py                |  8 +++---
 4 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py
index b3b1f6f982..c67d9f767b 100644
--- a/smartsim/_core/_cli/scripts/dragon_install.py
+++ b/smartsim/_core/_cli/scripts/dragon_install.py
@@ -9,7 +9,7 @@
 from smartsim._core._cli.utils import pip
 from smartsim._core._install.utils import retrieve
 from smartsim._core.config import CONFIG
-from smartsim._core.utils.helpers import check_platform, is_crayex_platform
+from smartsim._core.utils.helpers import check_platform, is_hsn_platform
 from smartsim.error.errors import SmartSimCLIActionCancelled
 from smartsim.log import get_logger
 
@@ -62,7 +62,7 @@ def _platform_filter(asset_name: str) -> bool:
     :returns: True if supplied value is correct for current platform"""
     key = "hsn"
     is_cray = key in asset_name.lower()
-    if is_crayex_platform():
+    if is_hsn_platform():
         return is_cray
     return not is_cray
 
@@ -137,7 +137,7 @@ def retrieve_asset_info() -> GitReleaseAsset:
     asset = filter_assets(assets)
 
     platform_result = check_platform()
-    if not platform_result.is_cray:
+    if not platform_result.is_hsn:
         logger.warning("Installing Dragon without HSTA support")
         for msg in platform_result.failures:
             logger.warning(msg)
diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py
index cddbc4ce98..bd163854e5 100644
--- a/smartsim/_core/utils/__init__.py
+++ b/smartsim/_core/utils/__init__.py
@@ -31,6 +31,6 @@
     execute_platform_cmd,
     expand_exe_path,
     installed_redisai_backends,
-    is_crayex_platform,
+    is_hsn_platform,
 )
 from .redis import check_cluster_status, create_cluster, db_is_active
diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py
index b17be763b4..185d44bdb0 100644
--- a/smartsim/_core/utils/helpers.py
+++ b/smartsim/_core/utils/helpers.py
@@ -250,7 +250,6 @@ def installed_redisai_backends(
     :param backends_path: path containing backends
     :return: list of installed RedisAI backends
     """
-    # import here to avoid circular import
     base_path = redis_install_base(backends_path)
     backends: t.Set[_TRedisAIBackendStr] = {
         "tensorflow",
@@ -318,7 +317,7 @@ def execute_platform_cmd(cmd: str) -> t.Tuple[str, int]:
     return process.stdout.decode("utf-8"), process.returncode
 
 
-class CrayExPlatformResult:
+class HSNPlatformResult:
     locate_msg = "Unable to locate `{0}`."
 
     def __init__(self, ldconfig: t.Optional[str], fi_info: t.Optional[str]) -> None:
@@ -337,7 +336,7 @@ def has_fi_info(self) -> bool:
         return bool(self.fi_info)
 
     @property
-    def is_cray(self) -> bool:
+    def is_hsn(self) -> bool:
         return all(
             (
                 self.has_ldconfig,
@@ -370,11 +369,12 @@ def failures(self) -> t.List[str]:
         return failure_messages
 
 
-def check_platform() -> CrayExPlatformResult:
-    """Returns True if the current platform is identified as Cray EX and
-    HSTA-aware dragon package can be installed, False otherwise.
+def check_platform() -> HSNPlatformResult:
+    """Queries the platform for system libraries to determine if the platform
+    has a compatible high speed network and an HSTA-aware dragon package can be
+    utilized.
 
-    :returns: True if current platform is Cray EX, False otherwise"""
+    :returns: A populated platform result"""
 
     # ldconfig -p | grep cray | grep pmi.so &&
     # ldconfig -p | grep cray | grep pmi2.so &&
@@ -383,7 +383,7 @@ def check_platform() -> CrayExPlatformResult:
     ldconfig = check_for_utility("ldconfig")
     fi_info = check_for_utility("fi_info")
 
-    result = CrayExPlatformResult(ldconfig, fi_info)
+    result = HSNPlatformResult(ldconfig, fi_info)
     if not all((result.has_ldconfig, result.has_fi_info)):
         return result
 
@@ -403,13 +403,14 @@ def check_platform() -> CrayExPlatformResult:
     return result
 
 
-def is_crayex_platform() -> bool:
-    """Returns True if the current platform is identified as Cray EX and
-    HSTA-aware dragon package can be installed, False otherwise.
+def is_hsn_platform() -> bool:
+    """Returns True if the current platform is identified as having a high
+    speed network and an HSTA-aware dragon package can be installed, False
+    otherwise.
 
-    :returns: True if current platform is Cray EX, False otherwise"""
+    :returns: True if current platform is HSN compatible, False otherwise"""
     result = check_platform()
-    return result.is_cray
+    return result.is_hsn
 
 
 @t.final
diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py
index 33facb9560..e1bbc00448 100644
--- a/tests/test_dragon_installer.py
+++ b/tests/test_dragon_installer.py
@@ -247,7 +247,7 @@ def test_retrieve_asset_info(
         )
         ctx.setattr(
             smartsim._core._cli.scripts.dragon_install,
-            "is_crayex_platform",
+            "is_hsn_platform",
             lambda: is_hsn,
         )
         ctx.setattr(
@@ -310,7 +310,7 @@ def mock_util_check(util: str) -> str:
             mock_util_check,
         )
 
-        assert not helpers.is_crayex_platform()
+        assert not helpers.is_hsn_platform()
 
 
 def test_is_hsn_missing_fi_info(monkeypatch: pytest.MonkeyPatch) -> None:
@@ -330,7 +330,7 @@ def mock_util_check(util: str) -> str:
             mock_util_check,
         )
 
-        assert not helpers.is_crayex_platform()
+        assert not helpers.is_hsn_platform()
 
 
 @pytest.mark.parametrize(
@@ -367,7 +367,7 @@ def mock_util_check(util: str) -> bool:
             lambda x: (output, return_code),
         )
 
-        platform_result = helpers.is_crayex_platform()
+        platform_result = helpers.is_hsn_platform()
         assert is_hsn == platform_result
 
 

From 0a0cccc0ffc67d4f83dff0dec96791ff501dc42d Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Mon, 28 Oct 2024 10:25:43 -0700
Subject: [PATCH 07/16] Expand docstrings of ProcessGroupInfo properties

---
 .../_core/launcher/dragon/dragonBackend.py     | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 078aa16c4a..37c6037b8a 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -84,24 +84,38 @@ def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.List[int]]:
 
     @property
     def puids(self) -> t.List[int]:
-        """List of Process UIDS belonging to the ProcessGroup"""
+        """List of Process IDs belonging to the ProcessGroup.
+
+        :returns: List of Process IDs belonging to the ProcessGroup.
+        """
         return list(set(itertools.chain(self.active_puids, self.inactive_puids)))
 
     @property
     def active_puids(self) -> t.List[int]:
+        """List of process IDs that are running.
+
+        :returns: List of process IDs that are running.
+        """
         if self.process_group is None:
             return []
         return list(self.process_group.puids)
 
     @property
     def inactive_puids(self) -> t.List[int]:
+        """List of process IDs that have completed.
+
+        :returns: List of process IDs that have completed.
+        """
         if self.process_group is None:
             return []
         return [puid for puid, _ in self.process_group.inactive_puids]
 
     @property
     def return_codes(self) -> t.List[int]:
-        """List of return codes of completed processes"""
+        """List of return codes of completed processes.
+
+        :returns: List of return codes of completed processes.
+        """
         if self.process_group is None:
             return [-1]
         if self.status == SmartSimStatus.STATUS_CANCELLED:

From 599b1f971b7dc94b320d15d2bae0d071adfcaeb6 Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Mon, 28 Oct 2024 10:32:30 -0700
Subject: [PATCH 08/16] Special return codes enshrined as constants

---
 smartsim/_core/launcher/dragon/dragonBackend.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 37c6037b8a..8eee8b345c 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -66,6 +66,10 @@
 logger = get_logger(__name__)
 
 
+_RETURN_CODES_NO_PROCESS_GROUP: t.Final = [-1]
+_RETURN_CODES_PROCESS_GROUP_CANCELLED: t.Final = [-9]
+
+
 @dataclass
 class ProcessGroupInfo:
     status: SmartSimStatus
@@ -117,9 +121,9 @@ def return_codes(self) -> t.List[int]:
         :returns: List of return codes of completed processes.
         """
         if self.process_group is None:
-            return [-1]
+            return _RETURN_CODES_NO_PROCESS_GROUP
         if self.status == SmartSimStatus.STATUS_CANCELLED:
-            return [-9]
+            return _RETURN_CODES_PROCESS_GROUP_CANCELLED
         return [ret for _, ret in self.process_group.inactive_puids]
 
     def __str__(self) -> str:

From a2d9d6bb4d46b5d5209af45ace5a4ed2bc8119d6 Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Mon, 28 Oct 2024 10:34:19 -0700
Subject: [PATCH 09/16] Release process group to match resource util behavior

---
 smartsim/_core/launcher/dragon/dragonBackend.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 8eee8b345c..f286eb56e0 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -80,6 +80,10 @@ class ProcessGroupInfo:
     """List of hosts on which the Process Group """
     redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None
     """Workers used to redirect stdout and stderr to file"""
+    _final_return_codes: t.Optional[t.List[int]] = field(default=None, init=False)
+    """Field to cache final return codes when a process group info is marked
+    as completed so that the underlying process group can be released.
+    """
 
     @property
     def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.List[int]]:
@@ -120,6 +124,8 @@ def return_codes(self) -> t.List[int]:
 
         :returns: List of return codes of completed processes.
         """
+        if self._final_return_codes is not None:
+            return self._final_return_codes
         if self.process_group is None:
             return _RETURN_CODES_NO_PROCESS_GROUP
         if self.status == SmartSimStatus.STATUS_CANCELLED:
@@ -141,6 +147,14 @@ def __str__(self) -> str:
 
         return ", ".join(msg)
 
+    def mark_complete(self) -> None:
+        """Cached the final return codes and release any underlying dragon
+        process groups.
+        """
+        self._final_return_codes = self.return_codes
+        self.process_group = None
+        self.redir_workers = None
+
 
 # Thanks to Colin Wahl from HPE HPC Dragon Team
 def redir_worker(io_conn: dragon_connection.Connection, file_path: str) -> None:
@@ -624,8 +638,7 @@ def _refresh_statuses(self) -> None:
                         except KeyError:
                             logger.error(f"Tried to free a non-allocated host: {host}")
                         self._free_hosts.append(host)
-                    # group_info.process_group = None
-                    group_info.redir_workers = None
+                    group_info.mark_complete()
 
     def _update_shutdown_status(self) -> None:
         self._heartbeat()

From 3a873f56b438529d6aa05fca3fc4052a48929705 Mon Sep 17 00:00:00 2001
From: Matt Drozt <matthew.drozt@gmail.com>
Date: Mon, 28 Oct 2024 10:46:40 -0700
Subject: [PATCH 10/16] Typo

---
 smartsim/_core/launcher/dragon/dragonBackend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index f286eb56e0..32c4d1c039 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -148,7 +148,7 @@ def __str__(self) -> str:
         return ", ".join(msg)
 
     def mark_complete(self) -> None:
-        """Cached the final return codes and release any underlying dragon
+        """Cache the final return codes and release any underlying dragon
         process groups.
         """
         self._final_return_codes = self.return_codes

From c8f6decf36f9c1a148c1ae8739e67abcc82e7d5c Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Wed, 30 Oct 2024 16:50:50 -0500
Subject: [PATCH 11/16] Prevent dragon processes from hanging

Co-authored-by: Al Rigazzi <al.rigazzi@hpe.com>
---
 smartsim/_core/launcher/dragon/dragonBackend.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 32c4d1c039..9fea9ddfc4 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -152,7 +152,11 @@ def mark_complete(self) -> None:
         process groups.
         """
         self._final_return_codes = self.return_codes
+        self.process_group.join()
+        self.process_group.close()
         self.process_group = None
+        self.redir_workers.join()
+        self.redir_workers.close()
         self.redir_workers = None
 
 

From f40dc1d93e5c5d29a8d9b2118119be54c198183f Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Wed, 30 Oct 2024 17:18:05 -0500
Subject: [PATCH 12/16] Re-order for type check

---
 smartsim/_core/launcher/dragon/dragonBackend.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 9fea9ddfc4..5300f6d3a5 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -151,13 +151,15 @@ def mark_complete(self) -> None:
         """Cache the final return codes and release any underlying dragon
         process groups.
         """
-        self._final_return_codes = self.return_codes
-        self.process_group.join()
-        self.process_group.close()
-        self.process_group = None
-        self.redir_workers.join()
-        self.redir_workers.close()
-        self.redir_workers = None
+        if self.process_group is not None:
+            self.process_group.join()
+            self._final_return_codes = self.return_codes
+            self.process_group.close()
+            self.process_group = None
+        if self.redir_workers is not None:
+            self.redir_workers.join()
+            self.redir_workers.close()
+            self.redir_workers = None
 
 
 # Thanks to Colin Wahl from HPE HPC Dragon Team

From e557228cc60664019c06d83e78570ba84adf1fd8 Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Wed, 30 Oct 2024 17:18:29 -0500
Subject: [PATCH 13/16] Use system status codes when cancelled

---
 smartsim/_core/launcher/dragon/dragonBackend.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 5300f6d3a5..5a7a28dd86 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -67,7 +67,6 @@
 
 
 _RETURN_CODES_NO_PROCESS_GROUP: t.Final = [-1]
-_RETURN_CODES_PROCESS_GROUP_CANCELLED: t.Final = [-9]
 
 
 @dataclass
@@ -128,8 +127,6 @@ def return_codes(self) -> t.List[int]:
             return self._final_return_codes
         if self.process_group is None:
             return _RETURN_CODES_NO_PROCESS_GROUP
-        if self.status == SmartSimStatus.STATUS_CANCELLED:
-            return _RETURN_CODES_PROCESS_GROUP_CANCELLED
         return [ret for _, ret in self.process_group.inactive_puids]
 
     def __str__(self) -> str:

From 60d8e73b7d0104d49dd40340f6faeda71d682cb5 Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Wed, 30 Oct 2024 17:14:18 -0500
Subject: [PATCH 14/16] Remove `CRAYEX-` from mock assets

---
 tests/test_dragon_installer.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py
index e1bbc00448..9fa06be508 100644
--- a/tests/test_dragon_installer.py
+++ b/tests/test_dragon_installer.py
@@ -103,13 +103,8 @@ def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset]
     assets: t.List[GitReleaseAsset] = []
     mock_archive_name_tpl = "{}-{}.4.1-{}ac132fe95.tar.gz"
 
-    for python_version, dragon_version, platform in itertools.chain(
-        itertools.product(
-            ["py3.9", "py3.10", "py3.11"], ["dragon-0.8", "dragon-0.9"], ["", "CRAYEX-"]
-        ),
-        itertools.product(
-            ["py3.9", "py3.10", "py3.11"], ["dragon-0.10", "dragon-0.11"], ["", "HSN-"]
-        ),
+    for python_version, dragon_version, platform in itertools.product(
+        ["py3.9", "py3.10", "py3.11"], ["dragon-0.10", "dragon-0.11"], ["", "HSN-"]
     ):
         asset = GitReleaseAsset(requester, headers, attributes, completed)
         archive_name = mock_archive_name_tpl.format(
@@ -192,9 +187,9 @@ def test_retrieve_cached(
     [
         # Dragon V0.8
         pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8"),
-        pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9"),
-        pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10"),
-        pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11"),
+        pytest.param("0.8", "py3.9", False, False, id="0.8,python 3.9"),
+        pytest.param("0.8", "py3.10", False, False, id="0.8,python 3.10"),
+        pytest.param("0.8", "py3.11", False, False, id="0.8,python 3.11"),
         pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12"),
         pytest.param("0.8", "py3.8", False, True, id="0.8,python 3.8,HSN"),
         pytest.param("0.8", "py3.9", False, True, id="0.8,python 3.9,HSN"),
@@ -203,9 +198,9 @@ def test_retrieve_cached(
         pytest.param("0.8", "py3.12", False, True, id="0.8,python 3.12,HSN"),
         # Dragon V0.9
         pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8"),
-        pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9"),
-        pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10"),
-        pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11"),
+        pytest.param("0.9", "py3.9", False, False, id="0.9,python 3.9"),
+        pytest.param("0.9", "py3.10", False, False, id="0.9,python 3.10"),
+        pytest.param("0.9", "py3.11", False, False, id="0.9,python 3.11"),
         pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12"),
         pytest.param("0.9", "py3.8", False, True, id="0.9,python 3.8,HSN"),
         pytest.param("0.9", "py3.9", False, True, id="0.9,python 3.9,HSN"),

From 216b2dfd068608e33e7283f3669afcd40eb7e6eb Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Fri, 1 Nov 2024 15:23:00 -0500
Subject: [PATCH 15/16] Do not raise and stop backend on nonzero exit

---
 smartsim/_core/launcher/dragon/dragonBackend.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 5a7a28dd86..971c60a6f2 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -409,7 +409,7 @@ def _create_redirect_workers(
         err_file: t.Optional[str],
     ) -> dragon_process_group.ProcessGroup:
         grp_redir = dragon_process_group.ProcessGroup(
-            restart=False, policy=global_policy, pmi_enabled=False
+            restart=False, ignore_error_on_exit=True, policy=global_policy, pmi_enabled=False
         )
         for pol, puid in zip(policies, puids):
             proc = dragon_process.Process(None, ident=puid)
@@ -528,7 +528,7 @@ def _start_steps(self) -> None:
                     host_name=hosts[0],
                 )
                 grp = dragon_process_group.ProcessGroup(
-                    restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy
+                    restart=False, ignore_error_on_exit=True, pmi_enabled=request.pmi_enabled, policy=global_policy
                 )
 
                 policies = []

From eb7639b3b89a01b50f569fe892eaea00a729267b Mon Sep 17 00:00:00 2001
From: Matt Drozt <drozt@hpe.com>
Date: Fri, 1 Nov 2024 15:44:15 -0500
Subject: [PATCH 16/16] Style

---
 smartsim/_core/launcher/dragon/dragonBackend.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 971c60a6f2..da59d9f24a 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -409,7 +409,10 @@ def _create_redirect_workers(
         err_file: t.Optional[str],
     ) -> dragon_process_group.ProcessGroup:
         grp_redir = dragon_process_group.ProcessGroup(
-            restart=False, ignore_error_on_exit=True, policy=global_policy, pmi_enabled=False
+            restart=False,
+            ignore_error_on_exit=True,
+            policy=global_policy,
+            pmi_enabled=False,
         )
         for pol, puid in zip(policies, puids):
             proc = dragon_process.Process(None, ident=puid)
@@ -528,7 +531,10 @@ def _start_steps(self) -> None:
                     host_name=hosts[0],
                 )
                 grp = dragon_process_group.ProcessGroup(
-                    restart=False, ignore_error_on_exit=True, pmi_enabled=request.pmi_enabled, policy=global_policy
+                    restart=False,
+                    ignore_error_on_exit=True,
+                    pmi_enabled=request.pmi_enabled,
+                    policy=global_policy,
                 )
 
                 policies = []