From e1079325e70e918794871a400c25328a22d99b92 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jan 2024 19:35:10 +0100 Subject: [PATCH] Remove Cobalt support (#448) As we are not aware of any system still using the Cobalt workload manager, its support in SmartSim was terminated. [ committed by @al-rigazzi ] [ reviewed by @MattToast @ashao ] --- .wci.yml | 3 +- README.md | 7 +- conftest.py | 38 +--- doc/api/smartsim_api.rst | 39 +--- doc/changelog.rst | 7 +- doc/developer.rst | 10 +- doc/experiment.rst | 5 +- doc/launchers.rst | 40 +--- doc/overview.rst | 3 +- doc/testing.rst | 5 - smartsim/_core/_install/builder.py | 13 +- smartsim/_core/control/controller.py | 14 +- smartsim/_core/control/manifest.py | 6 +- smartsim/_core/entrypoints/redis.py | 6 +- .../_core/entrypoints/telemetrymonitor.py | 4 +- smartsim/_core/launcher/__init__.py | 2 - smartsim/_core/launcher/cobalt/__init__.py | 25 --- .../_core/launcher/cobalt/cobaltCommands.py | 25 --- .../_core/launcher/cobalt/cobaltLauncher.py | 207 ------------------ .../_core/launcher/cobalt/cobaltParser.py | 86 -------- smartsim/_core/launcher/step/__init__.py | 1 - smartsim/_core/launcher/step/alpsStep.py | 6 - smartsim/_core/launcher/step/cobaltStep.py | 106 --------- smartsim/_core/launcher/step/mpiStep.py | 2 +- smartsim/_core/launcher/stepInfo.py | 36 --- smartsim/database/orchestrator.py | 15 +- smartsim/entity/dbobject.py | 6 +- smartsim/experiment.py | 6 +- smartsim/log.py | 6 +- smartsim/settings/__init__.py | 2 - smartsim/settings/alpsSettings.py | 3 +- smartsim/settings/cobaltSettings.py | 171 --------------- smartsim/settings/settings.py | 3 - smartsim/wlm/__init__.py | 8 +- tests/full_wlm/test_generic_batch_launch.py | 10 - .../full_wlm/test_generic_orc_launch_batch.py | 21 -- tests/full_wlm/test_mpmd.py | 5 +- tests/install/test_builder.py | 4 +- .../test_simple_base_settings_on_wlm.py | 12 +- tests/test_cobalt_parser.py | 54 ----- tests/test_configs/cov/local_cov.cfg | 3 - 
tests/test_configs/cov/lsf_cov.cfg | 3 - tests/test_configs/cov/pbs_cov.cfg | 3 - tests/test_configs/cov/slurm_cov.cfg | 3 - tests/test_controller.py | 3 +- tests/test_dbnode.py | 8 +- tests/test_experiment.py | 6 + tests/test_orchestrator.py | 48 ---- tests/test_run_settings.py | 2 +- .../getting_started/getting_started.ipynb | 5 +- 50 files changed, 91 insertions(+), 1015 deletions(-) delete mode 100644 smartsim/_core/launcher/cobalt/__init__.py delete mode 100644 smartsim/_core/launcher/cobalt/cobaltCommands.py delete mode 100644 smartsim/_core/launcher/cobalt/cobaltLauncher.py delete mode 100644 smartsim/_core/launcher/cobalt/cobaltParser.py delete mode 100644 smartsim/_core/launcher/step/cobaltStep.py delete mode 100644 smartsim/settings/cobaltSettings.py delete mode 100644 tests/test_cobalt_parser.py diff --git a/.wci.yml b/.wci.yml index 55b5ddda1..9ca3f4f78 100644 --- a/.wci.yml +++ b/.wci.yml @@ -10,7 +10,7 @@ Machine Learning (ML) libraries, like PyTorch and TensorFlow, in combination with High Performance Computing (HPC) simulations and applications. SmartSim launches ML infrastructure on HPC systems alongside user workloads - and supports most HPC workload managers (e.g. Slurm, PBSPro, LSF, Cobalt). + and supports most HPC workload managers (e.g. Slurm, PBSPro, LSF). SmartSim also provides a set of client libraries in Python, C++, C, and Fortran. These client libraries allow users to send and receive data between user applications and the machine learning infrastructure. Moreover, the @@ -41,7 +41,6 @@ - Slurm - PBSPro - LSF - - Cobalt - Linux/MacOS transfer_protocols: - TCP/IP diff --git a/README.md b/README.md index 4754b547f..cfd8d4271 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,6 @@ launch capabilities for all applications. 
- Slurm - LSF - PBSPro - - Cobalt - Local (for laptops/single node, no batch) @@ -198,7 +197,7 @@ qsub -l select=3:ncpus=20 -l walltime=00:10:00 -l place=scatter -I -q bsub -Is -W 00:10 -nnodes 3 -P $SHELL ``` -This same script will run on a SLURM, PBS, LSF, or Cobalt system as the ``launcher`` +This same script will run on a SLURM, PBS, or LSF system as the ``launcher`` is set to `auto` in the [Experiment](https://www.craylabs.org/docs/api/smartsim_api.html#experiment) initialization. The run command like ``mpirun``, ``aprun`` or ``srun`` will be automatically detected from what is available on the @@ -277,8 +276,8 @@ print(exp.get_status(ensemble)) python hello_ensemble.py ``` -Similar to the interactive example, this same script will run on a SLURM, PBS, LSF, -or Cobalt system as the ``launcher`` is set to `auto` in the +Similar to the interactive example, this same script will run on a SLURM, PBS, +or LSF system as the ``launcher`` is set to `auto` in the [Experiment](https://www.craylabs.org/docs/api/smartsim_api.html#experiment) initialization. Local launching does not support batch workloads. 
diff --git a/conftest.py b/conftest.py index 387da1ee4..fa1c888b9 100644 --- a/conftest.py +++ b/conftest.py @@ -101,7 +101,7 @@ def print_test_configuration() -> None: def pytest_configure() -> None: pytest.test_launcher = test_launcher - pytest.wlm_options = ["slurm", "pbs", "cobalt", "lsf", "pals"] + pytest.wlm_options = ["slurm", "pbs", "lsf", "pals"] account = get_account() pytest.test_account = account pytest.test_device = test_device @@ -153,12 +153,7 @@ def kill_all_test_spawned_processes() -> None: def get_hostlist() -> t.Optional[t.List[str]]: global test_hostlist if not test_hostlist: - if "COBALT_NODEFILE" in os.environ: - try: - return _parse_hostlist_file(os.environ["COBALT_NODEFILE"]) - except FileNotFoundError: - return None - elif "PBS_NODEFILE" in os.environ and test_launcher == "pals": + if "PBS_NODEFILE" in os.environ and test_launcher == "pals": # with PALS, we need a hostfile even if `aprun` is available try: return _parse_hostlist_file(os.environ["PBS_NODEFILE"]) @@ -269,19 +264,6 @@ def get_base_run_settings( run_args = {"--np": ntasks, "--hostfile": host_file} run_args.update(kwargs) return RunSettings(exe, args, run_command="mpiexec", run_args=run_args) - if test_launcher == "cobalt": - if shutil.which("aprun"): - run_command = "aprun" - run_args = {"--pes": ntasks} - else: - run_command = "mpirun" - host_file = os.environ["COBALT_NODEFILE"] - run_args = {"-n": ntasks, "--hostfile": host_file} - run_args.update(kwargs) - settings = RunSettings( - exe, args, run_command=run_command, run_args=run_args - ) - return settings if test_launcher == "lsf": run_args = {"--np": ntasks, "--nrs": nodes} run_args.update(kwargs) @@ -289,7 +271,7 @@ def get_base_run_settings( return settings if test_launcher != "local": raise SSConfigError( - "Base run settings are available for Slurm, PBS, Cobalt, " + "Base run settings are available for Slurm, PBS, " f"and LSF, but launcher was {test_launcher}" ) # TODO allow user to pick aprun vs MPIrun @@ -320,18 
+302,6 @@ def get_run_settings( run_args = {"np": ntasks, "hostfile": host_file} run_args.update(kwargs) return PalsMpiexecSettings(exe, args, run_args=run_args) - # TODO allow user to pick aprun vs MPIrun - if test_launcher == "cobalt": - if shutil.which("aprun"): - run_args = {"pes": ntasks} - run_args.update(kwargs) - return AprunSettings(exe, args, run_args=run_args) - - host_file = os.environ["COBALT_NODEFILE"] - run_args = {"n": ntasks, "hostfile": host_file} - run_args.update(kwargs) - return MpirunSettings(exe, args, run_args=run_args) - if test_launcher == "lsf": run_args = { "nrs": nodes, @@ -344,7 +314,7 @@ def get_run_settings( @staticmethod def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: - if test_launcher in ["pbs", "cobalt"]: + if test_launcher == "pbs": if not shutil.which("aprun"): hostlist = get_hostlist() else: diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index 5136c8aa5..adf7081ec 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -43,8 +43,8 @@ Settings are provided to ``Model`` and ``Ensemble`` objects to provide parameters for how a job should be executed. Some are specifically meant for certain launchers like ``SbatchSettings`` is solely meant for system using Slurm as a workload manager. -``MpirunSettings`` for OpenMPI based jobs is supported by Slurm, -PBSPro, and Cobalt. +``MpirunSettings`` for OpenMPI based jobs is supported by Slurm +and PBSPro. Types of Settings: @@ -60,7 +60,6 @@ Types of Settings: JsrunSettings SbatchSettings QsubBatchSettings - CobaltBatchSettings BsubBatchSettings Settings objects can accept a container object that defines a container @@ -137,7 +136,7 @@ AprunSettings ``AprunSettings`` can be used on any system that supports the Cray ALPS layer. SmartSim supports using ``AprunSettings`` -on PBSPro and Cobalt WLM systems. +on PBSPro WLM systems. 
``AprunSettings`` can be used in interactive session (on allocation) and within batch launches (e.g., ``QsubBatchSettings``) @@ -204,7 +203,7 @@ MpirunSettings ``MpirunSettings`` are for launching with OpenMPI. ``MpirunSettings`` are -supported on Slurm, PBSpro, and Cobalt. +supported on Slurm and PBSpro. .. autosummary:: @@ -231,7 +230,7 @@ MpiexecSettings ``MpiexecSettings`` are for launching with OpenMPI's ``mpiexec``. ``MpirunSettings`` are -supported on Slurm, PBSpro, and Cobalt. +supported on Slurm and PBSpro. .. autosummary:: @@ -258,7 +257,7 @@ OrterunSettings ``OrterunSettings`` are for launching with OpenMPI's ``orterun``. ``OrterunSettings`` are -supported on Slurm, PBSpro, and Cobalt. +supported on Slurm and PBSpro. .. autosummary:: @@ -336,32 +335,6 @@ be launched as a batch on PBSPro systems. :members: -.. _cqsub_api: - - -CobaltBatchSettings -------------------- - -``CobaltBatchSettings`` are used to configure jobs that should -be launched as a batch on Cobalt Systems. They closely mimic -that of the ``QsubBatchSettings`` for PBSPro. - - -.. autosummary:: - - CobaltBatchSettings.set_account - CobaltBatchSettings.set_batch_command - CobaltBatchSettings.set_nodes - CobaltBatchSettings.set_queue - CobaltBatchSettings.set_walltime - CobaltBatchSettings.format_batch_args - -.. autoclass:: CobaltBatchSettings - :inherited-members: - :undoc-members: - :members: - - .. _bsub_api: BsubBatchSettings diff --git a/doc/changelog.rst b/doc/changelog.rst index bdcf6cb5f..ff2c662ce 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -19,12 +19,16 @@ To be released at some future point in time Description +- Drop Cobalt support - Override the sphinx-tabs extension background color - Updated SmartSim's machine learning backends - Added ONNX support for Python 3.10 Detailed Notes +- As the Cobalt workload manager is not used on any system we are aware of, + its support in SmartSim was terminated and classes such as `CobaltLauncher` have + been removed. 
(SmartSim-PR448_) - The sphinx-tabs documentation extension uses a white background for the tabs component. A custom CSS for those components to inherit the overall theme color has been added. (SmartSim-PR453_) @@ -34,6 +38,7 @@ Detailed Notes (SmartSim-PR451_) +.. _SmartSim-PR448: https://github.com/CrayLabs/SmartSim/pull/448 .. _SmartSim-PR451: https://github.com/CrayLabs/SmartSim/pull/451 .. _SmartSim-PR453: https://github.com/CrayLabs/SmartSim/pull/453 @@ -454,7 +459,7 @@ Expand Machine Learning Library Support: Expand Launcher Setting Options: - - Add ability to use base ``RunSettings`` on a Slurm, PBS, or Cobalt launchers (SmartSim-PR90_) + - Add ability to use base ``RunSettings`` on Slurm or PBS launchers (SmartSim-PR90_) - Add ability to use base ``RunSettings`` on LFS launcher (SmartSim-PR108_) Deprecations and Breaking Changes diff --git a/doc/developer.rst b/doc/developer.rst index 4009819c3..632ee8d45 100644 --- a/doc/developer.rst +++ b/doc/developer.rst @@ -84,14 +84,14 @@ Local ===== There are two levels of testing in SmartSim. The first runs by default and does -not launch any jobs out onto a system through a workload manager like Cobalt. +not launch any jobs out onto a system through a workload manager like Slurm. If any of the above commands are used, the test suite will run the "light" test suite by default. -PBSPro, Slurm, Cobalt, LSF -========================== +PBSPro, Slurm, LSF +================== To run the full test suite, users will have to be on a system with one of the above workload managers. Additionally, users will need to obtain an allocation @@ -105,9 +105,6 @@ of at least 3 nodes. 
# for PBSPro (with aprun) qsub -l select=3 -l place=scatter -l walltime=00:10:00 -q queue - # for Cobalt (with aprun) - qsub -n 3 -t 00:10:00 -A account -q queue -I - # for LSF (with jsrun) bsub -Is -W 00:30 -nnodes 3 -P project $SHELL @@ -117,7 +114,6 @@ Once in an iterative allocation, users will need to set the test launcher environment variable: ``SMARTSIM_TEST_LAUNCHER`` to one of the following values - slurm - - cobalt - pbs - lsf - local diff --git a/doc/experiment.rst b/doc/experiment.rst index f7950d6d6..986db4cad 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -38,8 +38,8 @@ available compute resources on the system. Each launcher supports specific types of ``RunSettings``. - :ref:`SrunSettings ` for Slurm - - :ref:`AprunSettings ` for PBSPro and Cobalt - - :ref:`MpirunSettings ` for OpenMPI with `mpirun` on PBSPro, Cobalt, LSF, and Slurm + - :ref:`AprunSettings ` for PBSPro + - :ref:`MpirunSettings ` for OpenMPI with `mpirun` on PBSPro, LSF, and Slurm - :ref:`JsrunSettings ` for LSF These settings can be manually specified by the user, or auto-detected by the @@ -181,7 +181,6 @@ workload manager and available compute resources. - :ref:`SbatchSettings ` for Slurm - :ref:`QsubBatchSettings ` for PBSPro - - :ref:`CobaltBatchSettings ` for Cobalt - :ref:`BsubBatchSettings ` for LSF If it only passed ``RunSettings``, ``Ensemble``, objects will require either diff --git a/doc/launchers.rst b/doc/launchers.rst index 7d0c9970f..22425071e 100644 --- a/doc/launchers.rst +++ b/doc/launchers.rst @@ -16,9 +16,8 @@ SmartSim currently supports 5 `launchers`: 1. ``local``: for single-node, workstation, or laptop 2. ``slurm``: for systems using the Slurm scheduler 3. ``pbs``: for systems using the PBSpro scheduler - 4. ``cobalt``: for systems using the Cobalt scheduler - 5. ``lsf``: for systems using the LSF scheduler - 6. ``auto``: have SmartSim auto-detect the launcher to use. + 4. ``lsf``: for systems using the LSF scheduler + 5. 
``auto``: have SmartSim auto-detect the launcher to use. To specify a specific launcher, one argument needs to be provided to the ``Experiment`` initialization. @@ -30,7 +29,6 @@ to the ``Experiment`` initialization. exp = Experiment("name-of-experiment", launcher="local") # local launcher exp = Experiment("name-of-experiment", launcher="slurm") # Slurm launcher exp = Experiment("name-of-experiment", launcher="pbs") # PBSpro launcher - exp = Experiment("name-of-experiment", launcher="cobalt") # Cobalt launcher exp = Experiment("name-of-experiment", launcher="lsf") # LSF launcher exp = Experiment("name-of-experiment", launcher="auto") # auto-detect launcher @@ -219,42 +217,10 @@ creation. --------------------------------------------------------------------- -Cobalt -====== - -The Cobalt Launcher works just like the PBSPro launcher and -is compatible with ALPS and OpenMPI workloads as well. - -To use the Cobalt launcher, specify at ``Experiment`` initialization: - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("MOM6-double-gyre", launcher="cobalt") - - -Running on Cobalt ------------------ - -The Cobalt launcher supports three types of ``RunSettings``: - 1. :ref:`AprunSettings ` - 2. :ref:`MpirunSettings ` - 3. :ref:`MpiexecSettings ` - -As well as batch settings for ``qsub`` through: - 1. :ref:`CobaltBatchSettings ` - -Both supported ``RunSettings`` types above can be added -to a ``CobaltBatchSettings`` batch workload through ``Ensemble`` -creation. - ---------------------------------------------------------------------- - LSF === -The LSF Launcher works like the PBSPro and Cobalt launchers and +The LSF Launcher works like the PBSPro launcher and is compatible with LSF and OpenMPI workloads. 
To use the LSF launcher, specify at ``Experiment`` initialization: diff --git a/doc/overview.rst b/doc/overview.rst index 3ef046bb0..241d54eca 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -61,8 +61,7 @@ The key features of the IL are: - An API to start, monitor, and stop HPC jobs from Python or from a Jupyter notebook. - Automated deployment of in-memory data staging (`Redis `_) and computational storage (`RedisAI `_). - - Programmatic launches of batch and in-allocation jobs on PBS, Slurm, LSF, - and Cobalt systems. + - Programmatic launches of batch and in-allocation jobs on PBS, Slurm, and LSF systems. - Creating and configuring ensembles of workloads with isolated communication channels. The IL can configure and launch batch jobs as well as jobs within interactive diff --git a/doc/testing.rst b/doc/testing.rst index bdaa473d7..ccb2db3c2 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -78,9 +78,6 @@ Examples of how to obtain allocations on systems with the launchers: # for PBSPro (with aprun) qsub -l select=4 -l place=scatter -l walltime=00:10:00 -q queue - # for Cobalt (with aprun) - qsub -n 4 -t 00:10:00 -A account -q queue -I - # for LSF (with jsrun) bsub -Is -W 00:30 -nnodes 4 -P project $SHELL @@ -91,7 +88,6 @@ launcher environment variable: ``SMARTSIM_TEST_LAUNCHER`` to one of the following values - slurm - - cobalt - pbs - lsf - local @@ -273,4 +269,3 @@ The actions are defined using yaml files are are located in the Each pull request, push and merge the test suite for SmartRedis and SmartSim are run. For SmartSim, this is the ``local`` test suite with the local launcher. - diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index ef234228b..dba55bed5 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -336,9 +336,12 @@ class _RAIBuildDependency(ABC): @property @abstractmethod - def __rai_dependency_name__(self) -> str: ... 
+ def __rai_dependency_name__(self) -> str: + ... + @abstractmethod - def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: ... + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + ... def _place_rai_dep_at( @@ -677,7 +680,8 @@ def _threaded_map(fn: t.Callable[[_T], _U], items: t.Iterable[_T]) -> t.Sequence class _WebLocation(ABC): @property @abstractmethod - def url(self) -> str: ... + def url(self) -> str: + ... class _WebGitRepository(_WebLocation): @@ -731,7 +735,8 @@ class _ExtractableWebArchive(_WebArchive, ABC): @abstractmethod def _extract_download( self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] - ) -> None: ... + ) -> None: + ... def extract(self, target: t.Union[str, "os.PathLike[str]"]) -> None: with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index af7c4293e..e4ad012a8 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -63,13 +63,7 @@ from ...servertype import CLUSTERED, STANDALONE from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES from ..config import CONFIG -from ..launcher import ( - CobaltLauncher, - LocalLauncher, - LSFLauncher, - PBSLauncher, - SlurmLauncher, -) +from ..launcher import LocalLauncher, LSFLauncher, PBSLauncher, SlurmLauncher from ..launcher.launcher import Launcher from ..utils import check_cluster_status, create_cluster, serialize from .job import Job @@ -318,7 +312,7 @@ def get_entity_list_status( def init_launcher(self, launcher: str) -> None: """Initialize the controller with a specific type of launcher. 
- SmartSim currently supports slurm, pbs(pro), cobalt, lsf, + SmartSim currently supports slurm, pbs(pro), lsf, and local launching :param launcher: which launcher to initialize @@ -331,7 +325,6 @@ def init_launcher(self, launcher: str) -> None: "slurm": SlurmLauncher, "pbs": PBSLauncher, "pals": PBSLauncher, - "cobalt": CobaltLauncher, "lsf": LSFLauncher, "local": LocalLauncher, } @@ -882,7 +875,8 @@ def __init__(self, model: Model) -> None: self.entities = [model] self.batch_settings = model.batch_settings - def _initialize_entities(self, **kwargs: t.Any) -> None: ... + def _initialize_entities(self, **kwargs: t.Any) -> None: + ... def _look_up_launched_data( diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 452a52f09..9962d61b4 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -296,9 +296,9 @@ class LaunchedManifestBuilder(t.Generic[_T]): _ensembles: t.List[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]]] = field( default_factory=list, init=False ) - _databases: t.List[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]]] = ( - field(default_factory=list, init=False) - ) + _databases: t.List[ + t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]] + ] = field(default_factory=list, init=False) @property def exp_telemetry_subdirectory(self) -> pathlib.Path: diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index ef9911829..7262a5996 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -79,7 +79,8 @@ def print_summary( cmd: t.List[str], network_interface: str, shard_data: LaunchedShardData ) -> None: print( - textwrap.dedent(f"""\ + textwrap.dedent( + f"""\ ----------- Running Command ---------- COMMAND: {' '.join(cmd)} IPADDRESS: {shard_data.hostname} @@ -89,7 +90,8 @@ def print_summary( --------------- Output --------------- - """), + """ + ), flush=True, ) diff --git 
a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py index 9ff838f1c..ead72aa9b 100644 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -49,7 +49,6 @@ from smartsim._core.config import CONFIG from smartsim._core.control.job import JobEntity, _JobKey from smartsim._core.control.jobmanager import JobManager -from smartsim._core.launcher.cobalt.cobaltLauncher import CobaltLauncher from smartsim._core.launcher.launcher import Launcher from smartsim._core.launcher.local.local import LocalLauncher from smartsim._core.launcher.lsf.lsfLauncher import LSFLauncher @@ -325,14 +324,13 @@ def __init__( self._launcher_map: t.Dict[str, t.Type[Launcher]] = { "slurm": SlurmLauncher, "pbs": PBSLauncher, - "cobalt": CobaltLauncher, "lsf": LSFLauncher, "local": LocalLauncher, } def init_launcher(self, launcher: str) -> Launcher: """Initialize the controller with a specific type of launcher. - SmartSim currently supports slurm, pbs(pro), cobalt, lsf, + SmartSim currently supports slurm, pbs(pro), lsf, and local launching :param launcher: which launcher to initialize diff --git a/smartsim/_core/launcher/__init__.py b/smartsim/_core/launcher/__init__.py index 6e1aa724e..d7c68bfb6 100644 --- a/smartsim/_core/launcher/__init__.py +++ b/smartsim/_core/launcher/__init__.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from .cobalt.cobaltLauncher import CobaltLauncher from .launcher import Launcher from .local.local import LocalLauncher from .lsf.lsfLauncher import LSFLauncher @@ -33,7 +32,6 @@ __all__ = [ "Launcher", - "CobaltLauncher", "LocalLauncher", "LSFLauncher", "PBSLauncher", diff --git a/smartsim/_core/launcher/cobalt/__init__.py b/smartsim/_core/launcher/cobalt/__init__.py deleted file mode 100644 index bf6fd954c..000000000 --- a/smartsim/_core/launcher/cobalt/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/smartsim/_core/launcher/cobalt/cobaltCommands.py b/smartsim/_core/launcher/cobalt/cobaltCommands.py deleted file mode 100644 index bf6fd954c..000000000 --- a/smartsim/_core/launcher/cobalt/cobaltCommands.py +++ /dev/null @@ -1,25 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/smartsim/_core/launcher/cobalt/cobaltLauncher.py b/smartsim/_core/launcher/cobalt/cobaltLauncher.py deleted file mode 100644 index 56ebe12cc..000000000 --- a/smartsim/_core/launcher/cobalt/cobaltLauncher.py +++ /dev/null @@ -1,207 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import time -import typing as t - -import psutil - -from smartsim._core.launcher.step import Step -from smartsim.settings import ( - AprunSettings, - CobaltBatchSettings, - MpiexecSettings, - MpirunSettings, - OrterunSettings, - RunSettings, - SettingsBase, -) - -from ....error import LauncherError -from ....log import get_logger -from ....status import STATUS_CANCELLED, STATUS_COMPLETED -from ...config import CONFIG -from ..launcher import WLMLauncher -from ..pbs.pbsCommands import qdel, qstat -from ..step import ( - AprunStep, - CobaltBatchStep, - LocalStep, - MpiexecStep, - MpirunStep, - OrterunStep, - Step, -) -from ..stepInfo import CobaltStepInfo, StepInfo -from .cobaltParser import parse_cobalt_step_id, parse_cobalt_step_status, parse_qsub_out - -logger = get_logger(__name__) - - -class CobaltLauncher(WLMLauncher): - """This class encapsulates the functionality needed - to launch jobs on systems that use Cobalt as a workload manager. - - All WLM launchers are capable of launching managed and unmanaged - jobs. Managed jobs are queried through interaction with with WLM, - in this case Cobalt. Unmanaged jobs are held in the TaskManager - and are managed through references to their launching process ID - i.e. 
a psutil.Popen object - """ - - def __init__(self) -> None: - super().__init__() - self.user = psutil.Process().username() - - @property - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: - # RunSettings types supported by this launcher - return { - AprunSettings: AprunStep, - CobaltBatchSettings: CobaltBatchStep, - MpirunSettings: MpirunStep, - MpiexecSettings: MpiexecStep, - OrterunSettings: OrterunStep, - RunSettings: LocalStep, - } - - def run(self, step: Step) -> t.Optional[str]: - """Run a job step through Cobalt - - :param step: a job step instance - :type step: Step - :raises LauncherError: if launch fails - :return: job step id if job is managed - :rtype: str - """ - if not self.task_manager.actively_monitoring: - self.task_manager.start() - - cmd_list = step.get_launch_cmd() - step_id = None - task_id = None - if isinstance(step, CobaltBatchStep): - # wait for batch step to submit successfully - return_code, out, err = self.task_manager.start_and_wait(cmd_list, step.cwd) - if return_code != 0: - raise LauncherError( - f"Cobalt qsub batch submission failed\n {out}\n {err}" - ) - if out: - step_id = parse_qsub_out(out) - logger.debug(f"Gleaned batch job id: {step_id} for {step.name}") - else: - # aprun doesn't direct output for us. 
- out, err = step.get_output_files() - - # pylint: disable-next=consider-using-with - output = open(out, "w+", encoding="utf-8") - # pylint: disable-next=consider-using-with - error = open(err, "w+", encoding="utf-8") - - task_id = self.task_manager.start_task( - cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() - ) - - # if batch submission did not successfully retrieve job ID - if not step_id and step.managed: - step_id = self._get_cobalt_step_id(step) - - self.step_mapping.add(step.name, step_id, task_id, step.managed) - return step_id - - def stop(self, step_name: str) -> StepInfo: - """Step a job step - - :param step_name: name of the job to stop - :type step_name: str - :return: update for job due to cancel - :rtype: StepInfo - """ - stepmap = self.step_mapping[step_name] - if stepmap.managed: - qdel_rc, _, err = qdel([str(stepmap.step_id)]) - if qdel_rc != 0: - logger.warning(f"Unable to cancel job step {step_name}\n {err}") - if stepmap.task_id: - self.task_manager.remove_task(str(stepmap.task_id)) - else: - if stepmap.task_id: - self.task_manager.remove_task(str(stepmap.task_id)) - - _, step_info = self.get_step_update([step_name])[0] - if not step_info: - raise LauncherError(f"Could not get step_info for job step {step_name}") - step_info.status = STATUS_CANCELLED # set status to cancelled instead of failed - return step_info - - def _get_cobalt_step_id(self, step: Step, interval: int = 2) -> str: - """Get the step_id of a step from qstat (rarely used) - - Parses cobalt qstat output by looking for the step name - """ - step_id = None - trials = CONFIG.wlm_trials - while trials > 0: - output, _ = qstat(["--header", "JobName:JobId", "-u", self.user]) - step_id = parse_cobalt_step_id(output, step.name) - if step_id: - break - else: - time.sleep(interval) - trials -= 1 - if not step_id: - raise LauncherError("Could not find id of launched job step") - return step_id - - def _get_managed_step_update(self, step_ids: t.List[str]) -> 
t.List[StepInfo]: - """Get step updates for WLM managed jobs - - :param step_ids: list of job step ids - :type step_ids: list[str] - :return: list of updates for managed jobs - :rtype: list[StepInfo] - """ - args = ["--header", "JobId:State", "-u", self.user] - args.extend(step_ids) - qstat_out, _ = qstat(args) - - stats = [ - parse_cobalt_step_status(qstat_out, str(step_id)) for step_id in step_ids - ] - # create CobaltStepInfo objects to return - updates: t.List[StepInfo] = [] - for stat, _ in zip(stats, step_ids): - info = CobaltStepInfo(stat, None) # returncode not logged by Cobalt - - if info.status == STATUS_COMPLETED: - info.returncode = 0 - - updates.append(info) - return updates - - def __str__(self) -> str: - return "Cobalt" diff --git a/smartsim/_core/launcher/cobalt/cobaltParser.py b/smartsim/_core/launcher/cobalt/cobaltParser.py deleted file mode 100644 index c76509d36..000000000 --- a/smartsim/_core/launcher/cobalt/cobaltParser.py +++ /dev/null @@ -1,86 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -def parse_cobalt_step_status(output: str, step_id: str) -> str: - """ - Parse and return the status of a cobalt step - - :param output: output qstat - :type output: str - :param step_id: the id of the step to query - :type step_id: str - :rtype: str - """ - status = "NOTFOUND" - for line in output.split("\n"): - fields = line.split() - if len(fields) >= 2: - if fields[0] == step_id: - status = fields[1] - break - return status - - -def parse_cobalt_step_id(output: str, step_name: str) -> str: - """Parse and return the step id from a cobalt qstat command - - :param output: output qstat - :type output: str - :param step_name: the name of the step to query - :type step_name: str - :return: the step_id - :rtype: str - """ - step_id = "" - for line in output.split("\n"): - fields = line.split() - if len(fields) >= 2: - if fields[0] == step_name: - step_id = fields[1] - break - return step_id - - -def parse_qsub_out(output: str) -> str: - """ - Parse and return the step id from a cobalt qsub command - - :param output: output qstat - :type output: str - :return: the step_id - :rtype: str - """ - step_id = "" - for line in output.split("\n"): - try: - value = line.strip() - int(value) # if the cast works, return original string - step_id = value - break - except ValueError: - continue - return step_id diff --git a/smartsim/_core/launcher/step/__init__.py b/smartsim/_core/launcher/step/__init__.py index 98dd1a921..2b5f106cb 100644 --- 
a/smartsim/_core/launcher/step/__init__.py +++ b/smartsim/_core/launcher/step/__init__.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from .alpsStep import AprunStep -from .cobaltStep import CobaltBatchStep from .localStep import LocalStep from .lsfStep import BsubBatchStep, JsrunStep from .mpiStep import MpiexecStep, MpirunStep, OrterunStep diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index d675f703f..9629ee3d5 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -113,12 +113,6 @@ def _set_alloc(self) -> None: logger.debug( f"Running on PBS allocation {self.alloc} gleaned from user environment" ) - elif "COBALT_JOBID" in os.environ: - self.alloc = os.environ["COBALT_JOBID"] - logger.debug( - f"Running on Cobalt allocation {self.alloc} gleaned " - "from user environment" - ) else: raise AllocationError( "No allocation specified or found and not running in batch" diff --git a/smartsim/_core/launcher/step/cobaltStep.py b/smartsim/_core/launcher/step/cobaltStep.py deleted file mode 100644 index b224121e2..000000000 --- a/smartsim/_core/launcher/step/cobaltStep.py +++ /dev/null @@ -1,106 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -import stat -import typing as t - -from ....log import get_logger -from ....settings import CobaltBatchSettings -from .step import Step - -logger = get_logger(__name__) - - -class CobaltBatchStep(Step): - def __init__( - self, name: str, cwd: str, batch_settings: CobaltBatchSettings - ) -> None: - """Initialize a Cobalt qsub step - - :param name: name of the entity to launch - :type name: str - :param cwd: path to launch dir - :type cwd: str - :param batch_settings: batch settings for entity - :type batch_settings: CobaltBatchSettings - """ - super().__init__(name, cwd, batch_settings) - self.step_cmds: t.List[t.List[str]] = [] - self.managed = True - self.batch_settings = batch_settings - - def get_launch_cmd(self) -> t.List[str]: - """Get the launch command for the batch - - :return: launch command for the batch - :rtype: list[str] - """ - script = self._write_script() - return [self.batch_settings.batch_cmd, script] - - def add_to_batch(self, step: Step) -> None: - """Add a job step to this batch - - :param step: a job step instance e.g. 
SrunStep - :type step: Step - """ - launch_cmd = step.get_launch_cmd() - self.step_cmds.append(launch_cmd) - logger.debug(f"Added step command to batch for {step.name}") - - def _write_script(self) -> str: - """Write the batch script - - :return: batch script path after writing - :rtype: str - """ - batch_script = self.get_step_file(ending=".sh") - cobalt_debug = self.get_step_file(ending=".cobalt-debug") - output, error = self.get_output_files() - with open(batch_script, "w", encoding="utf-8") as script_file: - script_file.write("#!/bin/bash\n") - script_file.write(f"#COBALT -o {output}\n") - script_file.write(f"#COBALT -e {error}\n") - script_file.write(f"#COBALT --cwd {self.cwd}\n") - script_file.write(f"#COBALT --jobname {self.name}\n") - script_file.write(f"#COBALT --debuglog {cobalt_debug}\n") - - # add additional sbatch options - for opt in self.batch_settings.format_batch_args(): - script_file.write(f"#COBALT {opt}\n") - - for cmd in self.batch_settings.preamble: - script_file.write(f"{cmd}\n") - - for i, step_cmd in enumerate(self.step_cmds): - script_file.write("\n") - script_file.write(f"{' '.join((step_cmd))} &\n") - if i == len(self.step_cmds) - 1: - script_file.write("\n") - script_file.write("wait\n") - os.chmod(batch_script, stat.S_IXUSR | stat.S_IWUSR | stat.S_IRUSR) - return batch_script diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 7971fb732..7bbeed03b 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -57,7 +57,7 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings) -> None: self._set_alloc() self.run_settings = run_settings - _supported_launchers = ["PBS", "COBALT", "SLURM", "LSB"] + _supported_launchers = ["PBS", "SLURM", "LSB"] @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: diff --git a/smartsim/_core/launcher/stepInfo.py b/smartsim/_core/launcher/stepInfo.py index b33dac5ec..a1f500fc1 100644 --- 
a/smartsim/_core/launcher/stepInfo.py +++ b/smartsim/_core/launcher/stepInfo.py @@ -195,42 +195,6 @@ def __init__( ) -class CobaltStepInfo(StepInfo): # cov-cobalt - @property - def mapping(self) -> t.Dict[str, str]: - return { - "running": STATUS_RUNNING, - "queued": STATUS_PAUSED, - "starting": STATUS_PAUSED, - "dep_hold": STATUS_PAUSED, - "user_hold": STATUS_PAUSED, - "admin_hold": STATUS_PAUSED, - "dep_fail": STATUS_FAILED, # unsure of this one - "terminating": STATUS_COMPLETED, - "killing": STATUS_COMPLETED, - "exiting": STATUS_COMPLETED, - } - - def __init__( - self, - status: str = "", - returncode: t.Optional[int] = None, - output: t.Optional[str] = None, - error: t.Optional[str] = None, - ) -> None: - if status == "NOTFOUND": - # returncode not logged by Cobalt - # if job has exited the queue then we consider it "completed" - # this should only be hit in the case when job exits abnormally fast - smartsim_status = "Completed" - returncode = 0 - else: - smartsim_status = self._get_smartsim_status(status) - super().__init__( - smartsim_status, status, returncode, output=output, error=error - ) - - class LSFBatchStepInfo(StepInfo): # cov-lsf @property def mapping(self) -> t.Dict[str, str]: diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 07a1a1bfd..31bc1be6c 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -44,7 +44,6 @@ from ..settings import ( AprunSettings, BsubBatchSettings, - CobaltBatchSettings, JsrunSettings, MpiexecSettings, MpirunSettings, @@ -64,7 +63,6 @@ "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], - "cobalt": ["aprun", "mpirun", "mpiexec"], "lsf": ["jsrun"], "local": [""], } @@ -389,7 +387,7 @@ def set_cpus(self, num_cpus: int) -> None: :type num_cpus: int """ if self.batch: - if self.launcher in ["pbs", "cobalt"]: + if self.launcher == "pbs": if hasattr(self, "batch_settings") and self.batch_settings: if 
hasattr(self.batch_settings, "set_ncpus"): self.batch_settings.set_ncpus(num_cpus) @@ -938,17 +936,6 @@ def _fill_reserved(self) -> None: "chdir", "D", ] - self._reserved_batch_args[CobaltBatchSettings] = [ - "cwd", - "error", - "e", - "output", - "o", - "outputprefix", - "N", - "l", - "jobname", - ] self._reserved_batch_args[QsubBatchSettings] = ["e", "o", "N", "l"] self._reserved_run_args[JsrunSettings] = [ "chdir", diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index bebedb12c..5b9fb4ec6 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -49,9 +49,9 @@ def __init__( ) -> None: self.name = name self.func = func - self.file: t.Optional[Path] = ( - None # Need to have this explicitly to check on it - ) + self.file: t.Optional[ + Path + ] = None # Need to have this explicitly to check on it if file_path: self.file = self._check_filepath(file_path) self.device = self._check_device(device) diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 0b9365099..b81e6bae1 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -31,6 +31,8 @@ from tabulate import tabulate +from smartsim.error.errors import SSUnsupportedError + from ._core import Controller, Generator, Manifest from ._core.utils import init_default from .database import Orchestrator @@ -120,7 +122,7 @@ def __init__( :param exp_path: path to location of ``Experiment`` directory if generated :type exp_path: str, optional :param launcher: type of launcher being used, options are "slurm", "pbs", - "cobalt", "lsf", or "local". If set to "auto", + "lsf", or "local". If set to "auto", an attempt will be made to find an available launcher on the system. 
Defaults to "local" @@ -137,6 +139,8 @@ def __init__( if launcher == "auto": launcher = detect_launcher() + if launcher == "cobalt": + raise SSUnsupportedError("Cobalt launcher is no longer supported.") self._control = Controller(launcher=launcher) self._launcher = launcher.lower() diff --git a/smartsim/log.py b/smartsim/log.py index 44e26339e..9d01a57c5 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -39,9 +39,9 @@ # constants DEFAULT_DATE_FORMAT: t.Final[str] = "%H:%M:%S" -DEFAULT_LOG_FORMAT: t.Final[str] = ( - "%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s" -) +DEFAULT_LOG_FORMAT: t.Final[ + str +] = "%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s" EXPERIMENT_LOG_FORMAT = DEFAULT_LOG_FORMAT.replace("s[%", "s {%(exp_path)s} [%") # configure colored loggs diff --git a/smartsim/settings/__init__.py b/smartsim/settings/__init__.py index 542aeab1d..a3f1eeaed 100644 --- a/smartsim/settings/__init__.py +++ b/smartsim/settings/__init__.py @@ -26,7 +26,6 @@ from .alpsSettings import AprunSettings from .base import RunSettings, SettingsBase -from .cobaltSettings import CobaltBatchSettings from .containers import Container, Singularity from .lsfSettings import BsubBatchSettings, JsrunSettings from .mpiSettings import MpiexecSettings, MpirunSettings, OrterunSettings @@ -36,7 +35,6 @@ __all__ = [ "AprunSettings", - "CobaltBatchSettings", "BsubBatchSettings", "JsrunSettings", "MpirunSettings", diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index b36c3d333..c3df1fb74 100644 --- a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -43,8 +43,7 @@ def __init__( ): """Settings to run job with ``aprun`` command - ``AprunSettings`` can be used for both the `pbs` and `cobalt` - launchers. + ``AprunSettings`` can be used for the `pbs` launcher. 
:param exe: executable :type exe: str diff --git a/smartsim/settings/cobaltSettings.py b/smartsim/settings/cobaltSettings.py deleted file mode 100644 index 5a0e07b40..000000000 --- a/smartsim/settings/cobaltSettings.py +++ /dev/null @@ -1,171 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import typing as t - -from .base import BatchSettings - - -class CobaltBatchSettings(BatchSettings): - def __init__( - self, - nodes: t.Optional[int] = None, - time: str = "", - queue: t.Optional[str] = None, - account: t.Optional[str] = None, - batch_args: t.Optional[t.Dict[str, t.Optional[str]]] = None, - **kwargs: t.Any, - ) -> None: - """Specify settings for a Cobalt ``qsub`` batch launch - - If the argument doesn't have a parameter, put None - as the value. e.g. {'exclusive': None} - - Initialization values provided (nodes, time, account) - will overwrite the same arguments in ``batch_args`` if present - - :param nodes: number of nodes, defaults to None - :type nodes: int, optional - :param time: walltime for job, e.g. "10:00:00" for 10 hours, - defaults to empty str - :type time: str, optional - :param queue: queue to launch job in, defaults to None - :type queue: str, optional - :param account: account for job, defaults to None - :type account: str, optional - :param batch_args: extra batch arguments, defaults to None - :type batch_args: dict[str, str], optional - """ - super().__init__( - "qsub", - batch_args=batch_args, - nodes=nodes, - account=account, - queue=queue, - time=time, - **kwargs, - ) - - def set_walltime(self, walltime: str) -> None: - """Set the walltime of the job - - format = "HH:MM:SS" - - Cobalt walltime can also be specified with number - of minutes. 
- - :param walltime: wall time - :type walltime: str - """ - # TODO check for formatting errors here - # TODO catch existing "t" in batch_args - if walltime: - self.batch_args["time"] = walltime - - def set_nodes(self, num_nodes: int) -> None: - """Set the number of nodes for this batch job - - :param num_nodes: number of nodes - :type num_nodes: int - """ - # TODO catch existing "n" in batch_args - if num_nodes: - self.batch_args["nodecount"] = str(int(num_nodes)) - - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: - """Specify the hostlist for this job - - :param host_list: hosts to launch on - :type host_list: str | list[str] - :raises TypeError: if not str or list of str - """ - if isinstance(host_list, str): - host_list = [host_list.strip()] - if not isinstance(host_list, list): - raise TypeError("host_list argument must be a list of strings") - if not all(isinstance(host, str) for host in host_list): - raise TypeError("host_list argument must be list of strings") - hosts = ",".join(host_list) - self.batch_args["attrs"] = f"location={hosts}" - - def set_tasks(self, num_tasks: int) -> None: - """Set total number of processes to start - - :param num_tasks: number of processes - :type num_tasks: int - """ - self.batch_args["proccount"] = str(int(num_tasks)) - - def set_queue(self, queue: str) -> None: - """Set the queue for the batch job - - :param queue: queue name - :type queue: str - """ - # TODO catch existing "q" in batch args - if queue: - self.batch_args["queue"] = str(queue) - - def set_account(self, account: str) -> None: - """Set the account for this batch job - - :param acct: account id - :type acct: str - """ - # TODO catch existing "A" in batch_args - if account: - self.batch_args["project"] = account - - def format_batch_args(self) -> t.List[str]: - """Get the formatted batch arguments for a preview - - :return: list of batch arguments for Sbatch - :rtype: list[str] - """ - restricted = [ - "o", - "output", # output is 
determined by interface - "O", - "outputprefix", # step name is output prefix - "e", - "error", # error is determined by interface - "cwd", # cwd is determined by interface - "jobname", # step name is jobname - ] - opts = [] - for opt, value in self.batch_args.items(): - if opt not in restricted: - # attach "-" prefix if argument is 1 character otherwise "--" - short_arg = bool(len(str(opt)) == 1) - prefix = "-" if short_arg else "--" - if not value: - opts += [prefix + opt] - else: - if short_arg: - opts += [prefix + opt, str(value)] - else: - opts += [" ".join((prefix + opt, str(value)))] - return opts diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index b09286e8c..cb48790f9 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -31,7 +31,6 @@ from ..settings import ( AprunSettings, BsubBatchSettings, - CobaltBatchSettings, Container, JsrunSettings, MpiexecSettings, @@ -81,7 +80,6 @@ def create_batch_settings( """ # all supported batch class implementations by_launcher: t.Dict[str, t.Callable[..., base.BatchSettings]] = { - "cobalt": CobaltBatchSettings, "pbs": QsubBatchSettings, "slurm": SbatchSettings, "lsf": BsubBatchSettings, @@ -164,7 +162,6 @@ def create_run_settings( "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], - "cobalt": ["aprun", "mpirun", "mpiexec"], "lsf": ["jsrun", "mpirun", "mpiexec"], "local": [""], } diff --git a/smartsim/wlm/__init__.py b/smartsim/wlm/__init__.py index d7dd298be..fd694f5f2 100644 --- a/smartsim/wlm/__init__.py +++ b/smartsim/wlm/__init__.py @@ -36,7 +36,7 @@ def detect_launcher() -> str: """Detect available launcher.""" - # Precedence: PBS, Cobalt, LSF, Slurm, local + # Precedence: PBS, LSF, Slurm, local if which("qsub") and which("qstat") and which("qdel"): qsub_version = run( ["qsub", "--version"], @@ -47,8 +47,6 @@ def detect_launcher() -> str: ) if "pbs" in (qsub_version.stdout).lower(): return "pbs" - if "cobalt" in 
(qsub_version.stdout).lower(): - return "cobalt" if all( [which("bsub"), which("jsrun"), which("jslist"), which("bjobs"), which("bkill")] ): @@ -66,9 +64,7 @@ def detect_launcher() -> str: ): return "slurm" # Systems like ThetaGPU don't have - # Cobalt or PBS on compute nodes - if "COBALT_JOBID" in os.environ: - return "cobalt" + # PBS on compute nodes if "PBS_JOBID" in os.environ: return "pbs" return "local" diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 2b7db11e1..1b35730e3 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -58,8 +58,6 @@ def test_batch_model(fileutils, test_dir, wlmutils): batch_settings.set_account(wlmutils.get_test_account()) add_batch_resources(wlmutils, batch_settings) - if wlmutils.get_test_launcher() == "cobalt": - batch_settings.set_queue("debug-flat-quad") run_settings = wlmutils.get_run_settings("python", f"{script} --time=5") model = exp.create_model( "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings @@ -87,8 +85,6 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) - if wlmutils.get_test_launcher() == "cobalt": - batch.set_queue("debug-flat-quad") ensemble = exp.create_ensemble("batch-ens", batch_settings=batch) ensemble.add_model(M1) ensemble.add_model(M2) @@ -110,12 +106,6 @@ def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - sleep(30) - batch.set_queue("debug-flat-quad") ensemble = exp.create_ensemble( "batch-ens-replicas", batch_settings=batch, run_settings=settings, replicas=2 ) diff --git 
a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index f1f5952b3..6e445c298 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -60,9 +60,6 @@ def test_launch_orc_auto_batch(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:02:00") - if wlmutils.get_test_launcher() == "cobalt": - orc.batch_settings.set_queue("debug-flat-quad") - orc.set_path(test_dir) exp.start(orc, block=True) @@ -99,12 +96,6 @@ def test_launch_cluster_orc_batch_single(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:02:00") - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - time.sleep(120) - orc.batch_settings.set_queue("debug-flat-quad") orc.set_path(test_dir) exp.start(orc, block=True) @@ -141,12 +132,6 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:03:00") - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - time.sleep(120) - orc.batch_settings.set_queue("debug-flat-quad") orc.set_path(test_dir) exp.start(orc, block=True) @@ -178,12 +163,6 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:03:00") - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - time.sleep(120) - orc.batch_settings.set_queue("debug-flat-quad") exp.start(orc, block=True) diff --git 
a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 18e918cfd..576c2628f 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -48,9 +48,9 @@ def test_mpmd(fileutils, test_dir, wlmutils): exp_name = "test-mpmd" launcher = wlmutils.get_test_launcher() # MPMD is supported in LSF, but the test for it is different - mpmd_supported = ["slurm", "pbs", "cobalt"] + mpmd_supported = ["slurm", "pbs"] if launcher not in mpmd_supported: - pytest.skip("Test requires Slurm, PBS, or Cobalt to run") + pytest.skip("Test requires Slurm, or PBS to run") # aprun returns an error if the launched app is not an MPI exec # as we do not want to add mpi4py as a dependency, we prefer to @@ -58,7 +58,6 @@ def test_mpmd(fileutils, test_dir, wlmutils): by_launcher = { "slurm": ["srun", "mpirun"], "pbs": ["mpirun"], - "cobalt": ["mpirun"], } exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py index e6d030133..446cdf776 100644 --- a/tests/install/test_builder.py +++ b/tests/install/test_builder.py @@ -25,14 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import pytest - import functools import pathlib import platform import threading import time +import pytest + import smartsim._core._install.builder as build # The tests in this file belong to the group_a group diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index 08bf875e2..42186bc89 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -50,10 +50,8 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() - if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: - pytest.skip( - "Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM" - ) + if launcher not in ["pbs", "slurm", "lsf"]: + pytest.skip("Test only runs on systems with LSF, PBSPro, or Slurm as WLM") exp_name = "test-simplebase-settings-model-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) @@ -70,10 +68,8 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() - if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: - pytest.skip( - "Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM" - ) + if launcher not in ["pbs", "slurm", "lsf"]: + pytest.skip("Test only runs on systems with LSF, PBSPro, or Slurm as WLM") exp_name = "test-simplebase-settings-model-stop" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) diff --git a/tests/test_cobalt_parser.py b/tests/test_cobalt_parser.py deleted file mode 100644 index e91c95100..000000000 --- a/tests/test_cobalt_parser.py +++ /dev/null @@ -1,54 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import pytest - -from smartsim._core.launcher.cobalt import cobaltParser - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - - -def test_parse_step_id(): - output = "JobName JobId \n" "=====================\n" "smartsim 507975 \n" - step_id = cobaltParser.parse_cobalt_step_id(output, "smartsim") - assert step_id == "507975" - - -def test_parse_step_status(): - output = "JobName State \n" "=====================\n" "smartsim running \n" - step_id = cobaltParser.parse_cobalt_step_status(output, "smartsim") - assert step_id == "running" - - -def test_parse_qsub_out(): - output = ( - "Job routed to queue 'debug-flat-quad'.\n" - "Memory mode set to flat quad for queue debug-flat-quad\n" - "507998\n" - ) - step_id = cobaltParser.parse_qsub_out(output) - assert step_id == "507998" diff --git a/tests/test_configs/cov/local_cov.cfg b/tests/test_configs/cov/local_cov.cfg index d25e9f83a..481cc08c1 100644 --- a/tests/test_configs/cov/local_cov.cfg +++ b/tests/test_configs/cov/local_cov.cfg @@ -2,7 +2,6 @@ omit = *pbs* *slurm* - *cobalt* *mpirun* *alps* *lsf* @@ -39,7 +38,6 @@ exclude_lines= pragma: no cover cov-pbs cov-slurm - cov-cobalt cov-alps cov-mpirun cov-wlm @@ -49,6 +47,5 @@ exclude_lines= launcher == "slurm" launcher == "pbs" - launcher == "cobalt" launcher == "lsf" launcher == "pals" diff --git a/tests/test_configs/cov/lsf_cov.cfg b/tests/test_configs/cov/lsf_cov.cfg index 6e5f52eb4..03b27c5ec 100644 --- a/tests/test_configs/cov/lsf_cov.cfg +++ b/tests/test_configs/cov/lsf_cov.cfg @@ -2,7 +2,6 @@ omit = *slurm* *local* - *cobalt* *pbs* *alps* *redis_starter.py* @@ -36,11 +35,9 @@ exclude_lines= cov-slurm cov-local - cov-cobalt cov-alps cov-pbs pass launcher == "local" launcher == "slurm" - launcher == "cobalt" launcher == "pbs" diff --git a/tests/test_configs/cov/pbs_cov.cfg b/tests/test_configs/cov/pbs_cov.cfg index 99e7bcfd6..f9274cbf6 100644 --- a/tests/test_configs/cov/pbs_cov.cfg +++ b/tests/test_configs/cov/pbs_cov.cfg 
@@ -2,7 +2,6 @@ omit = *slurm* *local* - *cobalt* *mpirun* *alps* *lsf* @@ -37,11 +36,9 @@ exclude_lines= cov-slurm cov-local - cov-cobalt cov-alps cov-lsf pass launcher == "local" launcher == "slurm" - launcher == "cobalt" launcher == "lsf" diff --git a/tests/test_configs/cov/slurm_cov.cfg b/tests/test_configs/cov/slurm_cov.cfg index 59405bc35..5aa77cfbe 100644 --- a/tests/test_configs/cov/slurm_cov.cfg +++ b/tests/test_configs/cov/slurm_cov.cfg @@ -2,7 +2,6 @@ omit = *pbs* *local* - *cobalt* *mpirun* *alps* *lsf* @@ -37,11 +36,9 @@ exclude_lines= cov-pbs cov-local - cov-cobalt cov-alps cov-lsf pass launcher == "local" launcher == "pbs" - launcher == "cobalt" launcher == "lsf" diff --git a/tests/test_controller.py b/tests/test_controller.py index 65687ec59..85c90b1c3 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -48,7 +48,8 @@ class MockStep(Step): def _create_unique_name(name): return name - def add_to_batch(self, step): ... + def add_to_batch(self, step): + ... def get_launch_cmd(self): return [] diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index ec0ed23ea..0629fb60d 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -93,7 +93,9 @@ def test_launched_shard_info_can_be_serialized(): @pytest.mark.parametrize("limit", [None, 1]) def test_db_node_can_parse_launched_shard_info(limit): rand_shards = [_random_shard_info() for _ in range(3)] - with io.StringIO(textwrap.dedent("""\ + with io.StringIO( + textwrap.dedent( + """\ This is some file like str -------------------------- @@ -108,7 +110,9 @@ def test_db_node_can_parse_launched_shard_info(limit): SMARTSIM_ORC_SHARD_INFO: {} All other lines should be ignored. 
- """).format(*(json.dumps(s.to_dict()) for s in rand_shards))) as stream: + """ + ).format(*(json.dumps(s.to_dict()) for s in rand_shards)) + ) as stream: parsed_shards = DBNode._parse_launched_shard_info_from_iterable(stream, limit) if limit is not None: rand_shards = rand_shards[:limit] diff --git a/tests/test_experiment.py b/tests/test_experiment.py index c0185ab6d..139a61a99 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -31,6 +31,7 @@ from smartsim._core.config import CONFIG from smartsim.entity import Model from smartsim.error import SmartSimError +from smartsim.error.errors import SSUnsupportedError from smartsim.settings import RunSettings from smartsim.status import STATUS_NEVER_STARTED @@ -178,3 +179,8 @@ def test_enable_disable_telemtery(monkeypatch): assert CONFIG.telemetry_enabled exp.disable_telemetry() assert not CONFIG.telemetry_enabled + + +def test_error_on_cobalt(): + with pytest.raises(SSUnsupportedError): + exp = Experiment("cobalt_exp", launcher="cobalt") diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 4a1b08367..74ac21b81 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -256,54 +256,6 @@ def test_orc_results_in_correct_number_of_shards(single_cmd): ) -###### Cobalt ###### - - -def test_cobalt_set_run_arg(wlmutils): - orc = Orchestrator( - wlmutils.get_test_port(), - db_nodes=3, - batch=False, - interface="lo", - launcher="cobalt", - run_command="aprun", - ) - orc.set_run_arg("account", "ACCOUNT") - assert all( - [db.run_settings.run_args["account"] == "ACCOUNT" for db in orc.entities] - ) - orc.set_run_arg("pes-per-numa-node", "2") - assert all( - ["pes-per-numa-node" not in db.run_settings.run_args for db in orc.entities] - ) - - -def test_cobalt_set_batch_arg(wlmutils): - orc = Orchestrator( - wlmutils.get_test_port(), - db_nodes=3, - batch=False, - interface="lo", - launcher="cobalt", - run_command="aprun", - ) - with pytest.raises(SmartSimError): - 
orc.set_batch_arg("account", "ACCOUNT") - - orc2 = Orchestrator( - wlmutils.get_test_port(), - db_nodes=3, - batch=True, - interface="lo", - launcher="cobalt", - run_command="aprun", - ) - orc2.set_batch_arg("account", "ACCOUNT") - assert orc2.batch_settings.batch_args["account"] == "ACCOUNT" - orc2.set_batch_arg("outputprefix", "new_output/") - assert "outputprefix" not in orc2.batch_settings.batch_args - - ###### LSF ###### diff --git a/tests/test_run_settings.py b/tests/test_run_settings.py index 7bcd6d874..26562c825 100644 --- a/tests/test_run_settings.py +++ b/tests/test_run_settings.py @@ -97,7 +97,7 @@ def test_create_run_settings_local(): id=f"{l}/orterun", ), ) - for l in ("local", "pbs", "slurm", "lsf", "cobalt") + for l in ("local", "pbs", "slurm", "lsf") ) ), # Except for launchers that implement their own MPI settings diff --git a/tutorials/getting_started/getting_started.ipynb b/tutorials/getting_started/getting_started.ipynb index a80443564..0a5230b0f 100644 --- a/tutorials/getting_started/getting_started.ipynb +++ b/tutorials/getting_started/getting_started.ipynb @@ -36,7 +36,6 @@ "The `Experiment` also needs to have a `launcher` specified. Launchers provide SmartSim the ability to construct and execute complex workloads on HPC systems with schedulers (workload managers) like Slurm, or PBS. SmartSim currently supports\n", " * `slurm`\n", " * `pbs`\n", - " * `cobalt`\n", " * `lsf`\n", " * `local` (single node/laptops)\n", " * `auto`\n", @@ -809,7 +808,7 @@ "module = torch.jit.trace(net, example_forward_input)\n", "\n", "# Save the traced model to a file\n", - "torch.jit.save(module, \"./torch_cnn.pt\") " + "torch.jit.save(module, \"./torch_cnn.pt\")" ] }, { @@ -982,7 +981,7 @@ "source": [ "rs_prod = exp.create_run_settings(\"python\", f\"producer.py --redis-port {REDIS_PORT}\")\n", "ensemble = exp.create_ensemble(name=\"producer\",\n", - " replicas=2, \n", + " replicas=2,\n", " run_settings=rs_prod)" ] },