From 33ee012f949c05013b76e1fae965865290afe868 Mon Sep 17 00:00:00 2001 From: amandarichardsonn <30413257+amandarichardsonn@users.noreply.github.com> Date: Mon, 11 Mar 2024 11:04:00 -0700 Subject: [PATCH] Change Status Module (#509) Promote SmartSim statuses to a dedicated type named SmartSimStatus. [ reviewed by @MattToast @al-rigazzi ] [ committed by @amandarichardsonn ] --- smartsim/_core/control/controller.py | 20 +- smartsim/_core/control/job.py | 22 ++- smartsim/_core/control/jobmanager.py | 9 +- .../_core/entrypoints/telemetrymonitor.py | 4 +- smartsim/_core/launcher/lsf/lsfLauncher.py | 8 +- smartsim/_core/launcher/pbs/pbsLauncher.py | 8 +- .../_core/launcher/slurm/slurmLauncher.py | 6 +- smartsim/_core/launcher/stepInfo.py | 177 ++++++++++-------- smartsim/experiment.py | 5 +- smartsim/status.py | 36 ++-- tests/backends/test_dataloader.py | 4 +- tests/backends/test_dbmodel.py | 17 +- tests/backends/test_dbscript.py | 13 +- tests/backends/test_onnx.py | 4 +- tests/backends/test_tf.py | 4 +- tests/backends/test_torch.py | 4 +- tests/full_wlm/test_generic_batch_launch.py | 9 +- .../full_wlm/test_generic_orc_launch_batch.py | 19 +- tests/full_wlm/test_mpmd.py | 7 +- tests/on_wlm/test_base_settings_on_wlm.py | 7 +- tests/on_wlm/test_colocated_model.py | 17 +- tests/on_wlm/test_containers_wlm.py | 5 +- tests/on_wlm/test_generic_orc_launch.py | 15 +- tests/on_wlm/test_launch_errors.py | 5 +- tests/on_wlm/test_launch_ompi_lsf.py | 5 +- tests/on_wlm/test_local_step.py | 2 +- tests/on_wlm/test_restart.py | 7 +- .../test_simple_base_settings_on_wlm.py | 7 +- tests/on_wlm/test_simple_entity_launch.py | 11 +- tests/on_wlm/test_stop.py | 7 +- tests/test_colo_model_local.py | 19 +- tests/test_containers.py | 9 +- tests/test_experiment.py | 4 +- tests/test_launch_errors.py | 5 +- tests/test_local_launch.py | 7 +- tests/test_local_multi_run.py | 7 +- tests/test_local_restart.py | 11 +- tests/test_multidb.py | 7 +- tests/test_reconnect_orchestrator.py | 7 +- tests/test_smartredis.py | 7 +- tests/test_step_info.py | 8 +- tests/test_telemetry_monitor.py | 73 +++++--- 42 files changed, 346 insertions(+), 282 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 774be1982..bffa76240 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -61,7 +61,7 @@ ) from ...log import get_logger from ...servertype import CLUSTERED, STANDALONE -from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES +from ...status import TERMINAL_STATUSES, SmartSimStatus from ..config import CONFIG from ..launcher import LocalLauncher, LSFLauncher, PBSLauncher, SlurmLauncher from ..launcher.launcher import Launcher @@ -243,7 +243,13 @@ def stop_db(self, db: Orchestrator) -> None: continue job = self._jobs[node.name] - job.set_status(STATUS_CANCELLED, "", 0, output=None, error=None) + job.set_status( + SmartSimStatus.STATUS_CANCELLED, + "", + 0, + output=None, + error=None, + ) self._jobs.move_to_completed(job) db.reset_hosts() @@ -271,14 +277,14 @@ def get_jobs(self) -> t.Dict[str, Job]: def get_entity_status( self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] - ) -> str: + ) -> SmartSimStatus: """Get the status of an entity :param entity: entity to get status of :type entity: SmartSimEntity | EntitySequence :raises TypeError: if not SmartSimEntity | EntitySequence :return: status of entity - :rtype: str + :rtype: SmartSimStatus """ if not isinstance(entity, (SmartSimEntity, EntitySequence)): raise TypeError( @@ -289,14 +295,14 @@ def get_entity_status( def get_entity_list_status( self, entity_list: EntitySequence[SmartSimEntity] - ) -> t.List[str]: + ) -> t.List[SmartSimStatus]: """Get the statuses of an entity list :param entity_list: entity list containing entities to get statuses of :type entity_list: EntitySequence :raises TypeError: if not EntitySequence - :return: list of str statuses + :return: list of SmartSimStatus statuses :rtype: list """ if not isinstance(entity_list, EntitySequence): @@ -726,7 +732,7 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: # _jobs.get_status acquires JM lock for main thread, no need for locking statuses = self.get_entity_list_status(orchestrator) - if all(stat == STATUS_RUNNING for stat in statuses): + if all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses): ready = True # TODO remove in favor of by node status check time.sleep(CONFIG.jm_interval) diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index f3bd8cf3a..6064588ea 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -29,7 +29,7 @@ from dataclasses import dataclass from ...entity import EntitySequence, SmartSimEntity -from ...status import STATUS_NEW +from ...status import SmartSimStatus @dataclass(frozen=True) @@ -96,7 +96,7 @@ def __init__( self.name = job_name self.jid = job_id self.entity = entity - self.status = STATUS_NEW + self.status = SmartSimStatus.STATUS_NEW # status before smartsim status mapping is applied self.raw_status: t.Optional[str] = None self.returncode: t.Optional[int] = None @@ -116,7 +116,7 @@ def ename(self) -> str: def set_status( self, - new_status: str, + new_status: SmartSimStatus, raw_status: str, returncode: t.Optional[int], error: t.Optional[str] = None, @@ -125,9 +125,15 @@ def set_status( """Set the status of a job. :param new_status: The new status of the job - :type new_status: str + :type new_status: SmartSimStatus + :param raw_status: The raw status of the launcher + :type raw_status: str :param returncode: The return code for the job - :type return_code: str + :type return_code: int + :param error: Content produced by stderr + :type error: str + :param output: Content produced by stdout + :type output: str """ self.status = new_status self.raw_status = raw_status @@ -157,7 +163,7 @@ def reset( """ self.name = new_job_name self.jid = new_job_id - self.status = STATUS_NEW + self.status = SmartSimStatus.STATUS_NEW self.returncode = None self.output = None self.error = None @@ -213,14 +219,14 @@ def __init__(self, runs: int = 0) -> None: """ self.runs = runs self.jids: t.Dict[int, t.Optional[str]] = {} - self.statuses: t.Dict[int, str] = {} + self.statuses: t.Dict[int, SmartSimStatus] = {} self.returns: t.Dict[int, t.Optional[int]] = {} self.job_times: t.Dict[int, float] = {} def record( self, job_id: t.Optional[str], - status: str, + status: SmartSimStatus, returncode: t.Optional[int], job_time: float, ) -> None: diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index e482b9951..89363d520 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -35,7 +35,7 @@ from ...database import Orchestrator from ...entity import DBNode, EntitySequence, SmartSimEntity from ...log import ContextThread, get_logger -from ...status import STATUS_NEVER_STARTED, TERMINAL_STATUSES +from ...status import TERMINAL_STATUSES, SmartSimStatus from ..config import CONFIG from ..launcher import Launcher, LocalLauncher from ..utils.network import get_ip_from_host @@ -239,12 +239,13 @@ def check_jobs(self) -> None: def get_status( self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], - ) -> str: + ) -> SmartSimStatus: """Return the status of a job. :param entity: SmartSimEntity or EntitySequence instance :type entity: SmartSimEntity | EntitySequence - :returns: tuple of status + :returns: a SmartSimStatus status + :rtype: SmartSimStatus """ with self._lock: if entity.name in self.completed: @@ -254,7 +255,7 @@ def get_status( job: Job = self[entity.name] # locked return job.status - return STATUS_NEVER_STARTED + return SmartSimStatus.STATUS_NEVER_STARTED def set_launcher(self, launcher: Launcher) -> None: """Set the launcher of the job manager to a specific launcher instance diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py index 115528bf4..b5924fccd 100644 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -58,7 +58,7 @@ from smartsim._core.utils.helpers import get_ts from smartsim._core.utils.serialize import MANIFEST_FILENAME from smartsim.error.errors import SmartSimError -from smartsim.status import STATUS_COMPLETED, TERMINAL_STATUSES +from smartsim.status import TERMINAL_STATUSES, SmartSimStatus """Telemetry Monitor entrypoint""" @@ -286,7 +286,7 @@ def faux_return_code(step_info: StepInfo) -> t.Optional[int]: if step_info.status not in TERMINAL_STATUSES: return None - if step_info.status == STATUS_COMPLETED: + if step_info.status == SmartSimStatus.STATUS_COMPLETED: return os.EX_OK return 1 diff --git a/smartsim/_core/launcher/lsf/lsfLauncher.py b/smartsim/_core/launcher/lsf/lsfLauncher.py index a8b6fafdb..bfa560c2d 100644 --- a/smartsim/_core/launcher/lsf/lsfLauncher.py +++ b/smartsim/_core/launcher/lsf/lsfLauncher.py @@ -38,7 +38,7 @@ RunSettings, SettingsBase, ) -from ....status import STATUS_CANCELLED, STATUS_COMPLETED +from ....status import SmartSimStatus from ...config import CONFIG from ..launcher import WLMLauncher from ..step import ( @@ -155,7 +155,9 @@ def stop(self, step_name: str) -> StepInfo: if not step_info: raise LauncherError(f"Could not get step_info for job step {step_name}") - step_info.status = STATUS_CANCELLED # set status to cancelled instead of failed + step_info.status = ( + SmartSimStatus.STATUS_CANCELLED + ) # set status to cancelled instead of failed return step_info @staticmethod @@ -207,7 +209,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: # create LSFBatchStepInfo objects to return batch_info = LSFBatchStepInfo(stat, None) # account for case where job history is not logged by LSF - if batch_info.status == STATUS_COMPLETED: + if batch_info.status == SmartSimStatus.STATUS_COMPLETED: batch_info.returncode = 0 updates.append(batch_info) return updates diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index 0b2f85e95..bb1b46d46 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -39,7 +39,7 @@ RunSettings, SettingsBase, ) -from ....status import STATUS_CANCELLED, STATUS_COMPLETED +from ....status import SmartSimStatus from ...config import CONFIG from ..launcher import WLMLauncher from ..step import ( @@ -149,7 +149,9 @@ def stop(self, step_name: str) -> StepInfo: if not step_info: raise LauncherError(f"Could not get step_info for job step {step_name}") - step_info.status = STATUS_CANCELLED # set status to cancelled instead of failed + step_info.status = ( + SmartSimStatus.STATUS_CANCELLED + ) # set status to cancelled instead of failed return step_info @staticmethod @@ -191,7 +193,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: for stat, _ in zip(stats, step_ids): info = PBSStepInfo(stat, None) # account for case where job history is not logged by PBS - if info.status == STATUS_COMPLETED: + if info.status == SmartSimStatus.STATUS_COMPLETED: info.returncode = 0 updates.append(info) diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index e939a63db..a25e62806 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -40,7 +40,7 @@ SettingsBase, SrunSettings, ) -from ....status import STATUS_CANCELLED +from ....status import SmartSimStatus from ...config import CONFIG from ..launcher import WLMLauncher from ..step import ( @@ -218,7 +218,9 @@ def stop(self, step_name: str) -> StepInfo: if not step_info: raise LauncherError(f"Could not get step_info for job step {step_name}") - step_info.status = STATUS_CANCELLED # set status to cancelled instead of failed + step_info.status = ( + SmartSimStatus.STATUS_CANCELLED + ) # set status to cancelled instead of failed return step_info @staticmethod diff --git a/smartsim/_core/launcher/stepInfo.py b/smartsim/_core/launcher/stepInfo.py index 56b5218fc..875eb0322 100644 --- a/smartsim/_core/launcher/stepInfo.py +++ b/smartsim/_core/launcher/stepInfo.py @@ -28,20 +28,13 @@ import psutil -from ...status import ( - SMARTSIM_STATUS, - STATUS_CANCELLED, - STATUS_COMPLETED, - STATUS_FAILED, - STATUS_PAUSED, - STATUS_RUNNING, -) +from ...status import SmartSimStatus class StepInfo: def __init__( self, - status: str = "", + status: SmartSimStatus, launcher_status: str = "", returncode: t.Optional[int] = None, output: t.Optional[str] = None, @@ -54,48 +47,50 @@ def __init__( self.error = error def __str__(self) -> str: - info_str = f"Status: {self.status}" + info_str = f"Status: {self.status.value}" info_str += f" | Launcher Status {self.launcher_status}" info_str += f" | Returncode {str(self.returncode)}" return info_str @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: raise NotImplementedError def _get_smartsim_status( self, status: str, returncode: t.Optional[int] = None - ) -> str: + ) -> SmartSimStatus: """ Map the status of the WLM step to a smartsim-specific status """ - if status in SMARTSIM_STATUS: - return SMARTSIM_STATUS[status] + if any(ss_status.value == status for ss_status in SmartSimStatus): + return SmartSimStatus(status) if status in self.mapping and returncode in [None, 0]: return self.mapping[status] - return STATUS_FAILED + return SmartSimStatus.STATUS_FAILED class UnmanagedStepInfo(StepInfo): @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: # see https://github.com/giampaolo/psutil/blob/master/psutil/_pslinux.py # see https://github.com/giampaolo/psutil/blob/master/psutil/_common.py return { - psutil.STATUS_RUNNING: STATUS_RUNNING, - psutil.STATUS_SLEEPING: STATUS_RUNNING, # sleeping thread is still alive - psutil.STATUS_WAKING: STATUS_RUNNING, - psutil.STATUS_DISK_SLEEP: STATUS_RUNNING, - psutil.STATUS_DEAD: STATUS_FAILED, - psutil.STATUS_TRACING_STOP: STATUS_PAUSED, - psutil.STATUS_WAITING: STATUS_PAUSED, - psutil.STATUS_STOPPED: STATUS_PAUSED, - psutil.STATUS_LOCKED: STATUS_PAUSED, - psutil.STATUS_PARKED: STATUS_PAUSED, - psutil.STATUS_IDLE: STATUS_PAUSED, - psutil.STATUS_ZOMBIE: STATUS_COMPLETED, + psutil.STATUS_RUNNING: SmartSimStatus.STATUS_RUNNING, + psutil.STATUS_SLEEPING: ( + SmartSimStatus.STATUS_RUNNING + ), # sleeping thread is still alive + psutil.STATUS_WAKING: SmartSimStatus.STATUS_RUNNING, + psutil.STATUS_DISK_SLEEP: SmartSimStatus.STATUS_RUNNING, + psutil.STATUS_DEAD: SmartSimStatus.STATUS_FAILED, + psutil.STATUS_TRACING_STOP: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_WAITING: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_STOPPED: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_LOCKED: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_PARKED: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_IDLE: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_ZOMBIE: SmartSimStatus.STATUS_COMPLETED, } def __init__( @@ -114,30 +109,30 @@ def __init__( class SlurmStepInfo(StepInfo): # cov-slurm # see https://slurm.schedmd.com/squeue.html#lbAG mapping = { - "RUNNING": STATUS_RUNNING, - "CONFIGURING": STATUS_RUNNING, - "STAGE_OUT": STATUS_RUNNING, - "COMPLETED": STATUS_COMPLETED, - "DEADLINE": STATUS_COMPLETED, - "TIMEOUT": STATUS_COMPLETED, - "BOOT_FAIL": STATUS_FAILED, - "FAILED": STATUS_FAILED, - "NODE_FAIL": STATUS_FAILED, - "OUT_OF_MEMORY": STATUS_FAILED, - "CANCELLED": STATUS_CANCELLED, - "CANCELLED+": STATUS_CANCELLED, - "REVOKED": STATUS_CANCELLED, - "PENDING": STATUS_PAUSED, - "PREEMPTED": STATUS_PAUSED, - "RESV_DEL_HOLD": STATUS_PAUSED, - "REQUEUE_FED": STATUS_PAUSED, - "REQUEUE_HOLD": STATUS_PAUSED, - "REQUEUED": STATUS_PAUSED, - "RESIZING": STATUS_PAUSED, - "SIGNALING": STATUS_PAUSED, - "SPECIAL_EXIT": STATUS_PAUSED, - "STOPPED": STATUS_PAUSED, - "SUSPENDED": STATUS_PAUSED, + "RUNNING": SmartSimStatus.STATUS_RUNNING, + "CONFIGURING": SmartSimStatus.STATUS_RUNNING, + "STAGE_OUT": SmartSimStatus.STATUS_RUNNING, + "COMPLETED": SmartSimStatus.STATUS_COMPLETED, + "DEADLINE": SmartSimStatus.STATUS_COMPLETED, + "TIMEOUT": SmartSimStatus.STATUS_COMPLETED, + "BOOT_FAIL": SmartSimStatus.STATUS_FAILED, + "FAILED": SmartSimStatus.STATUS_FAILED, + "NODE_FAIL": SmartSimStatus.STATUS_FAILED, + "OUT_OF_MEMORY": SmartSimStatus.STATUS_FAILED, + "CANCELLED": SmartSimStatus.STATUS_CANCELLED, + "CANCELLED+": SmartSimStatus.STATUS_CANCELLED, + "REVOKED": SmartSimStatus.STATUS_CANCELLED, + "PENDING": SmartSimStatus.STATUS_PAUSED, + "PREEMPTED": SmartSimStatus.STATUS_PAUSED, + "RESV_DEL_HOLD": SmartSimStatus.STATUS_PAUSED, + "REQUEUE_FED": SmartSimStatus.STATUS_PAUSED, + "REQUEUE_HOLD": SmartSimStatus.STATUS_PAUSED, + "REQUEUED": SmartSimStatus.STATUS_PAUSED, + "RESIZING": SmartSimStatus.STATUS_PAUSED, + "SIGNALING": SmartSimStatus.STATUS_PAUSED, + "SPECIAL_EXIT": SmartSimStatus.STATUS_PAUSED, + "STOPPED": SmartSimStatus.STATUS_PAUSED, + "SUSPENDED": SmartSimStatus.STATUS_PAUSED, } def __init__( @@ -155,23 +150,27 @@ def __init__( class PBSStepInfo(StepInfo): # cov-pbs @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: # pylint: disable=line-too-long # see http://nusc.nsu.ru/wiki/lib/exe/fetch.php/doc/pbs/PBSReferenceGuide19.2.1.pdf#M11.9.90788.PBSHeading1.81.Job.States return { - "R": STATUS_RUNNING, - "B": STATUS_RUNNING, - "H": STATUS_PAUSED, - "M": STATUS_PAUSED, # Actually means that it was moved to another server, + "R": SmartSimStatus.STATUS_RUNNING, + "B": SmartSimStatus.STATUS_RUNNING, + "H": SmartSimStatus.STATUS_PAUSED, + "M": ( + SmartSimStatus.STATUS_PAUSED + ), # Actually means that it was moved to another server, # TODO: understand what this implies - "Q": STATUS_PAUSED, - "S": STATUS_PAUSED, - "T": STATUS_PAUSED, # This means in transition, see above for comment - "U": STATUS_PAUSED, - "W": STATUS_PAUSED, - "E": STATUS_COMPLETED, - "F": STATUS_COMPLETED, - "X": STATUS_COMPLETED, + "Q": SmartSimStatus.STATUS_PAUSED, + "S": SmartSimStatus.STATUS_PAUSED, + "T": ( + SmartSimStatus.STATUS_PAUSED + ), # This means in transition, see above for comment + "U": SmartSimStatus.STATUS_PAUSED, + "W": SmartSimStatus.STATUS_PAUSED, + "E": SmartSimStatus.STATUS_COMPLETED, + "F": SmartSimStatus.STATUS_COMPLETED, + "X": SmartSimStatus.STATUS_COMPLETED, } def __init__( @@ -183,10 +182,14 @@ def __init__( ) -> None: if status == "NOTFOUND": if returncode is not None: - smartsim_status = "Completed" if returncode == 0 else "Failed" + smartsim_status = ( + SmartSimStatus.STATUS_COMPLETED + if returncode == 0 + else SmartSimStatus.STATUS_FAILED + ) else: # if PBS job history isnt available, and job isnt in queue - smartsim_status = "Completed" + smartsim_status = SmartSimStatus.STATUS_COMPLETED returncode = 0 else: smartsim_status = self._get_smartsim_status(status) @@ -197,16 +200,16 @@ def __init__( class LSFBatchStepInfo(StepInfo): # cov-lsf @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: # pylint: disable=line-too-long # see https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=execution-about-job-states return { - "RUN": STATUS_RUNNING, - "PSUSP": STATUS_PAUSED, - "USUSP": STATUS_PAUSED, - "SSUSP": STATUS_PAUSED, - "PEND": STATUS_PAUSED, - "DONE": STATUS_COMPLETED, + "RUN": SmartSimStatus.STATUS_RUNNING, + "PSUSP": SmartSimStatus.STATUS_PAUSED, + "USUSP": SmartSimStatus.STATUS_PAUSED, + "SSUSP": SmartSimStatus.STATUS_PAUSED, + "PEND": SmartSimStatus.STATUS_PAUSED, + "DONE": SmartSimStatus.STATUS_COMPLETED, } def __init__( @@ -218,9 +221,13 @@ def __init__( ) -> None: if status == "NOTFOUND": if returncode is not None: - smartsim_status = "Completed" if returncode == 0 else "Failed" + smartsim_status = ( + SmartSimStatus.STATUS_COMPLETED + if returncode == 0 + else SmartSimStatus.STATUS_FAILED + ) else: - smartsim_status = "Completed" + smartsim_status = SmartSimStatus.STATUS_COMPLETED returncode = 0 else: smartsim_status = self._get_smartsim_status(status) @@ -231,14 +238,14 @@ def __init__( class LSFJsrunStepInfo(StepInfo): # cov-lsf @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: # pylint: disable=line-too-long # see https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=execution-about-job-states return { - "Killed": STATUS_COMPLETED, - "Running": STATUS_RUNNING, - "Queued": STATUS_PAUSED, - "Complete": STATUS_COMPLETED, + "Killed": SmartSimStatus.STATUS_COMPLETED, + "Running": SmartSimStatus.STATUS_RUNNING, + "Queued": SmartSimStatus.STATUS_PAUSED, + "Complete": SmartSimStatus.STATUS_COMPLETED, } def __init__( @@ -250,9 +257,13 @@ def __init__( ) -> None: if status == "NOTFOUND": if returncode is not None: - smartsim_status = "Completed" if returncode == 0 else "Failed" + smartsim_status = ( + SmartSimStatus.STATUS_COMPLETED + if returncode == 0 + else SmartSimStatus.STATUS_FAILED + ) else: - smartsim_status = "Completed" + smartsim_status = SmartSimStatus.STATUS_COMPLETED returncode = 0 else: smartsim_status = self._get_smartsim_status(status, returncode) diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 569d11e0a..279128282 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -32,6 +32,7 @@ from tabulate import tabulate from smartsim.error.errors import SSUnsupportedError +from smartsim.status import SmartSimStatus from ._core import Controller, Generator, Manifest from ._core.utils import init_default @@ -368,7 +369,7 @@ def finished(self, entity: SmartSimEntity) -> bool: @_contextualize def get_status( self, *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] - ) -> t.List[str]: + ) -> t.List[SmartSimStatus]: """Query the status of launched instances Return a smartsim.status string representing @@ -396,7 +397,7 @@ def get_status( """ try: manifest = Manifest(*args) - statuses: t.List[str] = [] + statuses: t.List[SmartSimStatus] = [] for entity in manifest.models: statuses.append(self._control.get_entity_status(entity)) for entity_list in manifest.all_entity_lists: diff --git a/smartsim/status.py b/smartsim/status.py index 409ec8c1a..e42ef3191 100644 --- a/smartsim/status.py +++ b/smartsim/status.py @@ -24,27 +24,21 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from enum import Enum -# Statuses that are applied to jobs -STATUS_RUNNING = "Running" -STATUS_COMPLETED = "Completed" -STATUS_CANCELLED = "Cancelled" -STATUS_FAILED = "Failed" -STATUS_NEW = "New" -STATUS_PAUSED = "Paused" -STATUS_NEVER_STARTED = "NeverStarted" -# SmartSim status mapping -SMARTSIM_STATUS = { - "Running": STATUS_RUNNING, - "Paused": STATUS_PAUSED, - "Completed": STATUS_COMPLETED, - "Cancelled": STATUS_CANCELLED, - "Failed": STATUS_FAILED, - "New": STATUS_NEW, - "NeverStarted": STATUS_NEVER_STARTED, -} +class SmartSimStatus(Enum): + STATUS_RUNNING = "Running" + STATUS_COMPLETED = "Completed" + STATUS_CANCELLED = "Cancelled" + STATUS_FAILED = "Failed" + STATUS_NEW = "New" + STATUS_PAUSED = "Paused" + STATUS_NEVER_STARTED = "NeverStarted" + -# Status groupings -TERMINAL_STATUSES = {STATUS_CANCELLED, STATUS_COMPLETED, STATUS_FAILED} -LIVE_STATUSES = {STATUS_RUNNING, STATUS_PAUSED, STATUS_NEW} +TERMINAL_STATUSES = { + SmartSimStatus.STATUS_CANCELLED, + SmartSimStatus.STATUS_COMPLETED, + SmartSimStatus.STATUS_FAILED, +} diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index d02f3f33c..e377f5631 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -35,7 +35,7 @@ from smartsim.experiment import Experiment from smartsim.log import get_logger from smartsim.ml.data import DataInfo, TrainingDataUploader -from smartsim.status import STATUS_COMPLETED +from smartsim.status import SmartSimStatus logger = get_logger(__name__) @@ -289,7 +289,7 @@ def test_torch_dataloaders(fileutils, test_dir, wlmutils): trainer = create_trainer_torch(exp, config_dir, wlmutils) exp.start(trainer, block=True) - assert exp.get_status(trainer)[0] == STATUS_COMPLETED + assert exp.get_status(trainer)[0] == SmartSimStatus.STATUS_COMPLETED except Exception as e: raise e diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 75e9f515d..3c02947e6 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -29,12 +29,13 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.entity import Ensemble from smartsim.entity.dbobject import DBModel from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger +from smartsim.status import SmartSimStatus logger = get_logger(__name__) @@ -218,7 +219,7 @@ def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): exp.start(db, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(db) @@ -285,7 +286,7 @@ def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): exp.start(db, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(db) @@ -386,7 +387,7 @@ def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): exp.start(db, smartsim_ensemble, block=True) statuses = exp.get_status(smartsim_ensemble) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(db) @@ -458,7 +459,7 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(colo_model) @@ -518,7 +519,7 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(colo_model) @@ -620,7 +621,7 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(colo_ensemble) @@ -724,7 +725,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(colo_ensemble) diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 2bffd1da6..9d0b04c8e 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -30,12 +30,13 @@ import pytest from smartredis import * -from smartsim import Experiment, status +from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.entity.dbobject import DBScript from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger from smartsim.settings import MpiexecSettings, MpirunSettings +from smartsim.status import SmartSimStatus logger = get_logger(__name__) @@ -125,7 +126,7 @@ def test_db_script(fileutils, test_dir, wlmutils, mlutils): try: exp.start(db, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: exp.stop(db) @@ -221,7 +222,7 @@ def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): try: exp.start(db, ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: exp.stop(db) @@ -288,7 +289,7 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): try: exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: exp.stop(colo_model) @@ -388,7 +389,7 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): try: exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: exp.stop(colo_ensemble) @@ -486,7 +487,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m try: exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: exp.stop(colo_ensemble) diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index 7c0e97e41..7972d1746 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -31,7 +31,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends -from smartsim.status import STATUS_FAILED +from smartsim.status import SmartSimStatus sklearn_available = True try: @@ -98,4 +98,4 @@ def test_sklearn_onnx(test_dir, mlutils, wlmutils): exp.stop(db) # if model failed, test will fail model_status = exp.get_status(model) - assert model_status[0] != STATUS_FAILED + assert model_status[0] != SmartSimStatus.STATUS_FAILED diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index af04c89cb..92cd01695 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.error import SmartSimError -from smartsim.status import STATUS_FAILED +from smartsim.status import SmartSimStatus tf_available = True try: @@ -87,7 +87,7 @@ def test_keras_model(test_dir, mlutils, wlmutils): exp.stop(db) # if model failed, test will fail model_status = exp.get_status(model)[0] - assert model_status != STATUS_FAILED + assert model_status != SmartSimStatus.STATUS_FAILED def create_tf_model(): diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index 76a989a2e..a36037de4 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -31,7 +31,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends -from smartsim.status import STATUS_FAILED +from smartsim.status import SmartSimStatus torch_available = True try: @@ -86,4 +86,4 @@ def test_torch_model_and_script(test_dir, mlutils, wlmutils): exp.stop(db) # if model failed, test will fail model_status = exp.get_status(model)[0] - assert model_status != STATUS_FAILED + assert model_status != SmartSimStatus.STATUS_FAILED diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index c69b1746a..3487ca81c 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.settings import QsubBatchSettings +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -67,7 +68,7 @@ def test_batch_model(fileutils, test_dir, wlmutils): exp.start(model, block=True) statuses = exp.get_status(model) assert len(statuses) == 1 - assert statuses[0] == status.STATUS_COMPLETED + assert statuses[0] == SmartSimStatus.STATUS_COMPLETED def test_batch_ensemble(fileutils, test_dir, wlmutils): @@ -92,7 +93,7 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): @@ -113,4 +114,4 @@ def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 058aef895..293a2cdd2 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -29,7 +29,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -66,13 +67,13 @@ def test_launch_orc_auto_batch(test_dir, wlmutils): statuses = exp.get_status(orc) # don't use assert so that we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_batch_single(test_dir, wlmutils): @@ -102,13 +103,13 @@ def test_launch_cluster_orc_batch_single(test_dir, wlmutils): statuses = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): @@ -138,13 +139,13 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): statuses = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_reconnect(test_dir, wlmutils): @@ -168,7 +169,7 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): statuses = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False @@ -185,7 +186,7 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): statuses = exp_2.get_status(reloaded_orc) for stat in statuses: - if stat == status.STATUS_FAILED: + if stat == SmartSimStatus.STATUS_FAILED: exp_2.stop(reloaded_orc) assert False diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 7f6cc2ea2..96497c760 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim._core.utils.helpers import is_valid_cmd +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -89,8 +90,8 @@ def prune_commands(launcher): mpmd_model = exp.create_model("mmpd", path=test_dir, run_settings=settings) exp.start(mpmd_model, block=True) statuses = exp.get_status(mpmd_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) exp.start(mpmd_model, block=True) statuses = exp.get_status(mpmd_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py index 0b31eedd2..77bebd524 100644 --- a/tests/on_wlm/test_base_settings_on_wlm.py +++ b/tests/on_wlm/test_base_settings_on_wlm.py @@ -28,7 +28,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus """ Test the launch and stop of models and ensembles using base @@ -54,7 +55,7 @@ def test_model_on_wlm(fileutils, test_dir, wlmutils): for _ in range(2): exp.start(M1, M2, block=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_model_stop_on_wlm(fileutils, test_dir, wlmutils): @@ -74,4 +75,4 @@ def test_model_stop_on_wlm(fileutils, test_dir, wlmutils): assert M1.name in exp._control._jobs.completed assert M2.name in exp._control._jobs.completed statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index 8baf74bf4..97a47542d 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.entity import Model +from smartsim.status import SmartSimStatus if sys.platform == "darwin": supported_dbs = ["tcp", "deprecated"] @@ -60,14 +61,14 @@ def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, db_type exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" # test restarting the colocated model exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -91,7 +92,7 @@ def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -114,7 +115,7 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, test_dir, coloutils, db_ty exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -139,7 +140,7 @@ def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -164,7 +165,7 @@ def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -189,5 +190,5 @@ def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, db_type): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index 8dc4baae0..b6054a78b 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -28,9 +28,10 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.entity import Ensemble from smartsim.settings.containers import Singularity +from smartsim.status import SmartSimStatus """Test SmartRedis container integration on a supercomputer with a WLM.""" @@ -92,7 +93,7 @@ def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) - if not all([stat == status.STATUS_COMPLETED for stat in statuses]): + if not all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]): exp.stop(orc) assert False # client ensemble failed diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index 6cf1c3918..f31c8a890 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -55,13 +56,13 @@ def test_launch_orc_auto(test_dir, wlmutils): statuses = exp.get_status(orc) # don't use assert so that we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_single(test_dir, wlmutils): @@ -88,13 +89,13 @@ def test_launch_cluster_orc_single(test_dir, wlmutils): statuses = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_multi(test_dir, wlmutils): @@ -121,10 +122,10 @@ def test_launch_cluster_orc_multi(test_dir, wlmutils): statuses = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) diff --git a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py index 905d96f54..2498a5a91 100644 --- a/tests/on_wlm/test_launch_errors.py +++ b/tests/on_wlm/test_launch_errors.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.error import SmartSimError +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -54,7 +55,7 @@ def test_failed_status(fileutils, test_dir, wlmutils): time.sleep(2) stat = exp.get_status(model) assert len(stat) == 1 - assert stat[0] == status.STATUS_FAILED + assert stat[0] == SmartSimStatus.STATUS_FAILED def test_bad_run_command_args(fileutils, test_dir, wlmutils): diff --git a/tests/on_wlm/test_launch_ompi_lsf.py b/tests/on_wlm/test_launch_ompi_lsf.py index ed5de291b..51c82e418 100644 --- a/tests/on_wlm/test_launch_ompi_lsf.py +++ b/tests/on_wlm/test_launch_ompi_lsf.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -49,4 +50,4 @@ def test_launch_openmpi_lsf(fileutils, test_dir, wlmutils): model = exp.create_model("ompi-model", path=test_dir, run_settings=settings) exp.start(model, block=True) statuses = exp.get_status(model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/on_wlm/test_local_step.py b/tests/on_wlm/test_local_step.py index 4e5f45e0b..8f7d823b8 100644 --- a/tests/on_wlm/test_local_step.py +++ b/tests/on_wlm/test_local_step.py @@ -29,7 +29,7 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.settings import RunSettings # retrieved from pytest fixtures diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py index 42bbe752c..0116c10d3 100644 --- a/tests/on_wlm/test_restart.py +++ b/tests/on_wlm/test_restart.py @@ -28,7 +28,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -48,10 +49,10 @@ def test_restart(fileutils, test_dir, wlmutils): exp.start(M1, M2, block=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) exp.start(M1, M2, block=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) # TODO add job history check here. diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index 1611781eb..caa55da3e 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.settings.settings import RunSettings +from smartsim.status import SmartSimStatus """ Test the launch and stop of simple models and ensembles that use base @@ -63,7 +64,7 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): # launch model twice to show that it can also be restarted for _ in range(2): exp.start(M, block=True) - assert exp.get_status(M)[0] == status.STATUS_COMPLETED + assert exp.get_status(M)[0] == SmartSimStatus.STATUS_COMPLETED def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): @@ -83,4 +84,4 @@ def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): time.sleep(2) exp.stop(M) assert M.name in exp._control._jobs.completed - assert exp.get_status(M)[0] == status.STATUS_CANCELLED + assert exp.get_status(M)[0] == SmartSimStatus.STATUS_CANCELLED diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index 1ecc27442..5dacc13da 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -28,7 +28,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus """ Test the launch of simple entity types on pre-existing allocations. @@ -59,7 +60,7 @@ def test_models(fileutils, test_dir, wlmutils): exp.start(M1, M2, block=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_ensemble(fileutils, test_dir, wlmutils): @@ -75,7 +76,7 @@ def test_ensemble(fileutils, test_dir, wlmutils): exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_summary(fileutils, test_dir, wlmutils): @@ -97,8 +98,8 @@ def test_summary(fileutils, test_dir, wlmutils): # start and poll exp.start(sleep, bad) - assert exp.get_status(bad)[0] == status.STATUS_FAILED - assert exp.get_status(sleep)[0] == status.STATUS_COMPLETED + assert exp.get_status(bad)[0] == SmartSimStatus.STATUS_FAILED + assert exp.get_status(sleep)[0] == SmartSimStatus.STATUS_COMPLETED summary_str = exp.summary(style="plain") print(summary_str) diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index 8d75d9f65..0c06375a4 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -28,7 +28,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus """ Test Stopping launched entities. @@ -55,7 +56,7 @@ def test_stop_entity(fileutils, test_dir, wlmutils): time.sleep(5) exp.stop(M1) assert M1.name in exp._control._jobs.completed - assert exp.get_status(M1)[0] == status.STATUS_CANCELLED + assert exp.get_status(M1)[0] == SmartSimStatus.STATUS_CANCELLED def test_stop_entity_list(fileutils, test_dir, wlmutils): @@ -73,5 +74,5 @@ def test_stop_entity_list(fileutils, test_dir, wlmutils): time.sleep(5) exp.stop(ensemble) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) assert all([m.name in exp._control._jobs.completed for m in ensemble]) diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index 138ceb4b7..fe347ee30 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -28,9 +28,10 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.entity import Model from smartsim.error import SSUnsupportedError +from smartsim.status import SmartSimStatus # The tests in this file belong to the slow_tests group pytestmark = pytest.mark.slow_tests @@ -139,13 +140,13 @@ def test_launch_colocated_model_defaults( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all(stat == status.STATUS_COMPLETED for stat in statuses) + assert all(stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses) # test restarting the colocated model exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses {statuses}" @@ -181,12 +182,12 @@ def test_launch_multiple_colocated_models( exp.generate(*colo_models) exp.start(*colo_models, block=True) statuses = exp.get_status(*colo_models) - assert all(stat == status.STATUS_COMPLETED for stat in statuses) + assert all(stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses) # test restarting the colocated model exp.start(*colo_models, block=True) statuses = exp.get_status(*colo_models) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) @@ -212,7 +213,7 @@ def test_colocated_model_disable_pinning( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) @@ -245,7 +246,7 @@ def test_colocated_model_pinning_auto_2cpu( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @@ -272,7 +273,7 @@ def test_colocated_model_pinning_range( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @@ -299,7 +300,7 @@ def test_colocated_model_pinning_list( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_colo_uds_verifies_socket_file_name(test_dir, launcher="local"): diff --git a/tests/test_containers.py b/tests/test_containers.py index 21fe50ad4..98fa5e1bb 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -31,10 +31,11 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.database import Orchestrator from smartsim.entity import Ensemble from smartsim.settings.containers import Singularity +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -109,7 +110,7 @@ def test_singularity_basic(fileutils, test_dir): # get and confirm status stat = exp.get_status(model)[0] - assert stat == status.STATUS_COMPLETED + assert stat == SmartSimStatus.STATUS_COMPLETED print(exp.summary()) @@ -136,7 +137,7 @@ def test_singularity_args(fileutils, test_dir): # get and confirm status stat = exp.get_status(model)[0] - assert stat == status.STATUS_COMPLETED + assert stat == SmartSimStatus.STATUS_COMPLETED print(exp.summary()) @@ -185,7 +186,7 @@ def test_singularity_smartredis(test_dir, fileutils, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) - if not all([stat == status.STATUS_COMPLETED for stat in statuses]): + if not all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]): exp.stop(orc) assert False # client ensemble failed diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 12b2f1579..1882508c4 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -33,7 +33,7 @@ from smartsim.error import SmartSimError from smartsim.error.errors import SSUnsupportedError from smartsim.settings import RunSettings -from smartsim.status import STATUS_NEVER_STARTED +from smartsim.status import SmartSimStatus # The tests in this file belong to the slow_tests group pytestmark = pytest.mark.slow_tests @@ -88,7 +88,7 @@ def test_status_typeerror(): def test_status_pre_launch(): model = Model("name", {}, "./", RunSettings("python")) exp = Experiment("test") - assert exp.get_status(model)[0] == STATUS_NEVER_STARTED + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_NEVER_STARTED def test_bad_ensemble_init_no_rs(): diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index 7e2c5d9be..4431cd31c 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -27,10 +27,11 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.database import Orchestrator from smartsim.error import SSUnsupportedError from smartsim.settings import JsrunSettings, RunSettings +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -57,7 +58,7 @@ def test_model_failure(fileutils, test_dir): exp.start(M1, block=True) statuses = exp.get_status(M1) - assert all([stat == status.STATUS_FAILED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_FAILED for stat in statuses]) def test_orchestrator_relaunch(test_dir, wlmutils): diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index 7befff95e..fa09806b3 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -49,7 +50,7 @@ def test_models(fileutils, test_dir): exp.start(M1, M2, block=True, summary=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_ensemble(fileutils, test_dir): @@ -64,4 +65,4 @@ def test_ensemble(fileutils, test_dir): exp.start(ensemble, block=True, summary=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index 576e290ca..a2c1d70ee 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -49,9 +50,9 @@ def test_models(fileutils, test_dir): exp.start(M1, block=False) statuses = exp.get_status(M1) - assert all([stat != status.STATUS_FAILED for stat in statuses]) + assert all([stat != SmartSimStatus.STATUS_FAILED for stat in statuses]) # start another while first model is running exp.start(M2, block=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index c59aebd7b..383e99900 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -48,12 +49,12 @@ def test_restart(fileutils, test_dir): exp.start(M1, block=True) statuses = exp.get_status(M1) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) # restart the model exp.start(M1, block=True) statuses = exp.get_status(M1) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_ensemble(fileutils, test_dir): @@ -68,9 +69,9 @@ def test_ensemble(fileutils, test_dir): exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) # restart the ensemble exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index af21f5a1e..5a530dc97 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -27,11 +27,12 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.database import Orchestrator from smartsim.entity.entity import SmartSimEntity from smartsim.error.errors import SSDBIDConflictError from smartsim.log import get_logger +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -51,7 +52,7 @@ def make_entity_context(exp: Experiment, entity: SmartSimEntity): try: yield entity finally: - if exp.get_status(entity)[0] == status.STATUS_RUNNING: + if exp.get_status(entity)[0] == SmartSimStatus.STATUS_RUNNING: exp.stop(entity) @@ -65,7 +66,7 @@ def choose_host(wlmutils, index=0): def check_not_failed(exp, *args): statuses = exp.get_status(*args) - assert all(stat is not status.STATUS_FAILED for stat in statuses) + assert all(stat is not SmartSimStatus.STATUS_FAILED for stat in statuses) @pytest.mark.parametrize("db_type", supported_dbs) diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index 554e42cbd..12d9cfb95 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -29,8 +29,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.database import Orchestrator +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -54,7 +55,7 @@ def test_local_orchestrator(test_dir, wlmutils): exp.start(orc) statuses = exp.get_status(orc) - assert [stat != status.STATUS_FAILED for stat in statuses] + assert [stat != SmartSimStatus.STATUS_FAILED for stat in statuses] # simulate user shutting down main thread exp._control._jobs.actively_monitoring = False @@ -76,7 +77,7 @@ def test_reconnect_local_orc(test_dir): statuses = exp_2.get_status(reloaded_orc) for stat in statuses: - if stat == status.STATUS_FAILED: + if stat == SmartSimStatus.STATUS_FAILED: exp_2.stop(reloaded_orc) assert False exp_2.stop(reloaded_orc) diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py index 282e708cc..a2aac654b 100644 --- a/tests/test_smartredis.py +++ b/tests/test_smartredis.py @@ -27,10 +27,11 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.database import Orchestrator from smartsim.entity import Ensemble, Model +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -97,7 +98,7 @@ def test_exchange(fileutils, test_dir, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) try: - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: # stop the orchestrator exp.stop(orc) @@ -146,7 +147,7 @@ def test_consumer(fileutils, test_dir, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) try: - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: # stop the orchestrator exp.stop(orc) diff --git a/tests/test_step_info.py b/tests/test_step_info.py index ec589ae76..fcccaa9cd 100644 --- a/tests/test_step_info.py +++ b/tests/test_step_info.py @@ -26,8 +26,8 @@ import pytest -from smartsim import status from smartsim._core.launcher.stepInfo import * +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -35,7 +35,9 @@ def test_str(): step_info = StepInfo( - status=status.STATUS_COMPLETED, launcher_status="COMPLETED", returncode=0 + status=SmartSimStatus.STATUS_COMPLETED, + launcher_status="COMPLETED", + returncode=0, ) expected_output = "Status: Completed | Launcher Status COMPLETED | Returncode 0" @@ -45,4 +47,4 @@ def test_str(): def test_default(): step_info = UnmanagedStepInfo() - assert step_info._get_smartsim_status(None) == status.STATUS_FAILED + assert step_info._get_smartsim_status(None) == SmartSimStatus.STATUS_FAILED diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index ac3599d7d..823767adb 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -57,14 +57,7 @@ from smartsim._core.utils import serialize from smartsim.error.errors import UnproxyableStepError from smartsim.settings.base import RunSettings -from smartsim.status import ( - STATUS_CANCELLED, - STATUS_COMPLETED, - STATUS_FAILED, - STATUS_NEW, - STATUS_PAUSED, - STATUS_RUNNING, -) +from smartsim.status import SmartSimStatus ALL_ARGS = {"-exp_dir", "-frequency"} PROXY_ENTRY_POINT = "smartsim._core.entrypoints.indirect" @@ -455,7 +448,7 @@ def test_telemetry_single_model(fileutils, test_dir, wlmutils, config): smartsim_model = exp.create_model("perroquet", app_settings) exp.generate(smartsim_model) exp.start(smartsim_model, block=True) - assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir start_events = list(telemetry_output_path.rglob("start.json")) @@ -496,7 +489,7 @@ def test_telemetry_single_model_nonblocking( telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) - assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -534,7 +527,10 @@ def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch, con exp.generate(*smartsim_models) exp.start(*smartsim_models, block=True) assert all( - [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + [ + status == SmartSimStatus.STATUS_COMPLETED + for status in exp.get_status(*smartsim_models) + ] ) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir @@ -581,7 +577,10 @@ def test_telemetry_serial_models_nonblocking( snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) assert all( - [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + [ + status == SmartSimStatus.STATUS_COMPLETED + for status in exp.get_status(*smartsim_models) + ] ) start_events = list(telemetry_output_path.rglob("start.json")) @@ -629,7 +628,7 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config exp.stop(orc) snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) - assert exp.get_status(orc)[0] == STATUS_CANCELLED + assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 @@ -672,7 +671,7 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, con exp.stop(orc) snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) - assert exp.get_status(orc)[0] == STATUS_CANCELLED + assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 @@ -719,8 +718,8 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch, conf telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) - assert exp.get_status(orc)[0] == STATUS_CANCELLED - assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED + assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED start_events = list(telemetry_output_path.rglob("database/**/start.json")) stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) @@ -759,7 +758,12 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch, config): ens = exp.create_ensemble("troupeau", run_settings=app_settings, replicas=5) exp.generate(ens) exp.start(ens, block=True) - assert all([status == STATUS_COMPLETED for status in exp.get_status(ens)]) + assert all( + [ + status == SmartSimStatus.STATUS_COMPLETED + for status in exp.get_status(ens) + ] + ) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) @@ -798,7 +802,10 @@ def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch, c exp.generate(smartsim_model) exp.start(smartsim_model, block=True) assert all( - [status == STATUS_COMPLETED for status in exp.get_status(smartsim_model)] + [ + status == SmartSimStatus.STATUS_COMPLETED + for status in exp.get_status(smartsim_model) + ] ) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir @@ -1044,12 +1051,12 @@ def test_multistart_experiment( @pytest.mark.parametrize( "status_in, expected_out", [ - pytest.param(STATUS_CANCELLED, 1, id="failure on cancellation"), - pytest.param(STATUS_COMPLETED, 0, id="success on completion"), - pytest.param(STATUS_FAILED, 1, id="failure on failed"), - pytest.param(STATUS_NEW, None, id="failure on new"), - pytest.param(STATUS_PAUSED, None, id="failure on paused"), - pytest.param(STATUS_RUNNING, None, id="failure on running"), + pytest.param(SmartSimStatus.STATUS_CANCELLED, 1, id="failure on cancellation"), + pytest.param(SmartSimStatus.STATUS_COMPLETED, 0, id="success on completion"), + pytest.param(SmartSimStatus.STATUS_FAILED, 1, id="failure on failed"), + pytest.param(SmartSimStatus.STATUS_NEW, None, id="failure on new"), + pytest.param(SmartSimStatus.STATUS_PAUSED, None, id="failure on paused"), + pytest.param(SmartSimStatus.STATUS_RUNNING, None, id="failure on running"), ], ) def test_faux_rc(status_in: str, expected_out: t.Optional[int]): @@ -1063,12 +1070,18 @@ def test_faux_rc(status_in: str, expected_out: t.Optional[int]): @pytest.mark.parametrize( "status_in, expected_out, expected_has_jobs", [ - pytest.param(STATUS_CANCELLED, 1, False, id="failure on cancellation"), - pytest.param(STATUS_COMPLETED, 0, False, id="success on completion"), - pytest.param(STATUS_FAILED, 1, False, id="failure on failed"), - pytest.param(STATUS_NEW, None, True, id="failure on new"), - pytest.param(STATUS_PAUSED, None, True, id="failure on paused"), - pytest.param(STATUS_RUNNING, None, True, id="failure on running"), + pytest.param( + SmartSimStatus.STATUS_CANCELLED, 1, False, id="failure on cancellation" + ), + pytest.param( + SmartSimStatus.STATUS_COMPLETED, 0, False, id="success on completion" + ), + pytest.param(SmartSimStatus.STATUS_FAILED, 1, False, id="failure on failed"), + pytest.param(SmartSimStatus.STATUS_NEW, None, True, id="failure on new"), + pytest.param(SmartSimStatus.STATUS_PAUSED, None, True, id="failure on paused"), + pytest.param( + SmartSimStatus.STATUS_RUNNING, None, True, id="failure on running" + ), ], ) def test_wlm_completion_handling(