diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index 288561c39..714eb3819 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -1,7 +1,7 @@ # # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -39,11 +39,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: ref: doc path: doc-branch diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0b9b973c3..ad9a55e03 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,7 @@ # # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -56,8 +56,8 @@ jobs: os: [ubuntu-20.04, macos-12] steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 - name: Install cibuildwheel run: python -m pip install cibuildwheel>=2.12.3 @@ -93,9 +93,9 @@ jobs: name: Build source distribution runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 name: Install Python with: python-version: '3.8' diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index fc817cba9..79466b902 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -1,7 +1,7 @@ # # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,6 +34,10 @@ on: branches: - develop +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: HOMEBREW_NO_ANALYTICS: "ON" # Make Homebrew installation a little quicker HOMEBREW_NO_AUTO_UPDATE: "ON" @@ -53,15 +57,14 @@ jobs: os: [macos-12, ubuntu-20.04] # Operating systems compiler: [8] # GNU compiler version rai: [1.2.7] # Redis AI versions - py_v: [3.8, 3.9, '3.10'] # Python versions - + py_v: ['3.8', '3.9', '3.10', '3.11'] # Python versions env: SMARTSIM_REDISAI: ${{ matrix.rai }} steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.py_v }} @@ -101,19 +104,12 @@ jobs: # on developments of the client are brought in. 
- name: Install SmartSim (with ML backends) run: | - python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis python -m pip install .[dev,ml] - - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) - if: (matrix.py_v != '3.10') run: smart build --device cpu --onnx -v - - name: Install ML Runtimes with Smart (with pt and tf support) - if: (matrix.py_v == '3.10') - run: smart build --device cpu -v - - name: Run mypy run: | python -m pip install .[mypy] @@ -122,6 +118,15 @@ jobs: - name: Run Pylint run: make check-lint + # Run isort/black style check + - name: Run isort + run: isort --check-only --profile black ./smartsim ./tests + + # Run isort/black style check + - name: Run black + run: | + black --exclude smartsim/version.py --check ./smartsim ./tests + # Run pytest (backends subdirectory) - name: Run Pytest if: (matrix.subset == 'backends') @@ -151,7 +156,7 @@ jobs: retention-days: 5 - name: Upload Pytest coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3.1.4 with: fail_ci_if_error: false files: ./coverage.xml diff --git a/.pylintrc b/.pylintrc index da0886ba2..f2fa17bab 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/.wci.yml b/.wci.yml index 55b5ddda1..fd4ae0c1c 100644 --- a/.wci.yml +++ b/.wci.yml @@ -10,7 +10,7 @@ Machine Learning (ML) libraries, like PyTorch and TensorFlow, in combination with High Performance Computing (HPC) simulations and applications. SmartSim launches ML infrastructure on HPC systems alongside user workloads - and supports most HPC workload managers (e.g. Slurm, PBSPro, LSF, Cobalt). + and supports most HPC workload managers (e.g. Slurm, PBSPro, LSF). 
SmartSim also provides a set of client libraries in Python, C++, C, and Fortran. These client libraries allow users to send and receive data between user applications and the machine learning infrastructure. Moreover, the @@ -22,8 +22,8 @@ language: Python release: - version: 0.6.0 - date: 2023-12-18 + version: 0.6.1 + date: 2024-02-15 documentation: general: https://www.craylabs.org/docs/overview.html @@ -41,7 +41,6 @@ - Slurm - PBSPro - LSF - - Cobalt - Linux/MacOS transfer_protocols: - TCP/IP diff --git a/LICENSE.md b/LICENSE.md index 9312d5762..7e5e1594b 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2021-2023, Hewlett Packard Enterprise +Copyright (c) 2021-2024, Hewlett Packard Enterprise All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/Makefile b/Makefile index fef69eab3..d8a2f0e6b 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -66,6 +66,10 @@ clobber: clean # help: # help: Style # help: ------- +# help: check-all - Performs all the style-related checks +.PHONY: check-all +check-all: check-style check-format check-sort-imports check-lint check-mypy + $(info All style checks PASSED) # help: style - Sort imports and format with black .PHONY: style @@ -146,11 +150,11 @@ tutorials-dev: @docker compose build tutorials-dev @docker run -p 8888:8888 smartsim-tutorials:dev-latest -# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.6.0) +# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.6.1) .PHONY: tutorials-prod tutorials-prod: @docker compose build tutorials-prod - @docker run -p 8888:8888 smartsim-tutorials:v0.6.0 + @docker run -p 8888:8888 smartsim-tutorials:v0.6.1 # help: diff --git a/README.md b/README.md index df671ef02..cfd8d4271 100644 --- a/README.md +++ b/README.md @@ -100,8 +100,8 @@ before using it on your system. Each tutorial is a Jupyter notebook that can be which will run a jupyter lab with the tutorials, SmartSim, and SmartRedis installed. ```bash -docker pull ghcr.io/craylabs/smartsim-tutorials:v0.4.1 -docker run -p 8888:8888 ghcr.io/craylabs/smartsim-tutorials:v0.4.1 +docker pull ghcr.io/craylabs/smartsim-tutorials:latest +docker run -p 8888:8888 ghcr.io/craylabs/smartsim-tutorials:latest # click on link to open jupyter lab ``` @@ -179,7 +179,6 @@ launch capabilities for all applications. 
- Slurm - LSF - PBSPro - - Cobalt - Local (for laptops/single node, no batch) @@ -198,7 +197,7 @@ qsub -l select=3:ncpus=20 -l walltime=00:10:00 -l place=scatter -I -q bsub -Is -W 00:10 -nnodes 3 -P $SHELL ``` -This same script will run on a SLURM, PBS, LSF, or Cobalt system as the ``launcher`` +This same script will run on a SLURM, PBS, or LSF system as the ``launcher`` is set to `auto` in the [Experiment](https://www.craylabs.org/docs/api/smartsim_api.html#experiment) initialization. The run command like ``mpirun``, ``aprun`` or ``srun`` will be automatically detected from what is available on the @@ -277,8 +276,8 @@ print(exp.get_status(ensemble)) python hello_ensemble.py ``` -Similar to the interactive example, this same script will run on a SLURM, PBS, LSF, -or Cobalt system as the ``launcher`` is set to `auto` in the +Similar to the interactive example, this same script will run on a SLURM, PBS, +or LSF system as the ``launcher`` is set to `auto` in the [Experiment](https://www.craylabs.org/docs/api/smartsim_api.html#experiment) initialization. Local launching does not support batch workloads. @@ -452,8 +451,8 @@ Each tutorial is a Jupyter notebook that can be run through the which will run a jupyter lab with the tutorials, SmartSim, and SmartRedis installed. ```bash -docker pull ghcr.io/craylabs/smartsim-tutorials:v1 -docker run -p 8888:8888 ghcr.io/craylabs/smartsim-tutorials:v0.4.1 +docker pull ghcr.io/craylabs/smartsim-tutorials:latest +docker run -p 8888:8888 ghcr.io/craylabs/smartsim-tutorials:latest ``` Each of the following examples can be found in the [SmartSim documentation](https://www.craylabs.org/docs/tutorials/getting_started/getting_started.html). 
@@ -640,15 +639,15 @@ from C, C++, Fortran and Python with the SmartRedis Clients: 1.2.7 PyTorch - 1.11.x + 2.0.1 TensorFlow\Keras - 2.8.x + 2.13.1 ONNX - 1.11.x + 1.14.1 diff --git a/conftest.py b/conftest.py index ff4e56ee1..b5a4fd70b 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -101,7 +101,7 @@ def print_test_configuration() -> None: def pytest_configure() -> None: pytest.test_launcher = test_launcher - pytest.wlm_options = ["slurm", "pbs", "cobalt", "lsf", "pals"] + pytest.wlm_options = ["slurm", "pbs", "lsf", "pals"] account = get_account() pytest.test_account = account pytest.test_device = test_device @@ -153,12 +153,7 @@ def kill_all_test_spawned_processes() -> None: def get_hostlist() -> t.Optional[t.List[str]]: global test_hostlist if not test_hostlist: - if "COBALT_NODEFILE" in os.environ: - try: - return _parse_hostlist_file(os.environ["COBALT_NODEFILE"]) - except FileNotFoundError: - return None - elif "PBS_NODEFILE" in os.environ and test_launcher == "pals": + if "PBS_NODEFILE" in os.environ and test_launcher == "pals": # with PALS, we need a hostfile even if `aprun` is available try: return _parse_hostlist_file(os.environ["PBS_NODEFILE"]) @@ -269,19 +264,6 @@ def get_base_run_settings( run_args = {"--np": ntasks, "--hostfile": host_file} run_args.update(kwargs) return RunSettings(exe, args, run_command="mpiexec", run_args=run_args) - if test_launcher == "cobalt": - if shutil.which("aprun"): - run_command = "aprun" - run_args = {"--pes": ntasks} - else: - run_command = "mpirun" - host_file = os.environ["COBALT_NODEFILE"] - run_args = {"-n": ntasks, "--hostfile": host_file} - run_args.update(kwargs) - settings = RunSettings( - exe, args, run_command=run_command, run_args=run_args - ) - return settings if 
test_launcher == "lsf": run_args = {"--np": ntasks, "--nrs": nodes} run_args.update(kwargs) @@ -289,7 +271,7 @@ def get_base_run_settings( return settings if test_launcher != "local": raise SSConfigError( - "Base run settings are available for Slurm, PBS, Cobalt, " + "Base run settings are available for Slurm, PBS, " f"and LSF, but launcher was {test_launcher}" ) # TODO allow user to pick aprun vs MPIrun @@ -320,18 +302,6 @@ def get_run_settings( run_args = {"np": ntasks, "hostfile": host_file} run_args.update(kwargs) return PalsMpiexecSettings(exe, args, run_args=run_args) - # TODO allow user to pick aprun vs MPIrun - if test_launcher == "cobalt": - if shutil.which("aprun"): - run_args = {"pes": ntasks} - run_args.update(kwargs) - return AprunSettings(exe, args, run_args=run_args) - - host_file = os.environ["COBALT_NODEFILE"] - run_args = {"n": ntasks, "hostfile": host_file} - run_args.update(kwargs) - return MpirunSettings(exe, args, run_args=run_args) - if test_launcher == "lsf": run_args = { "nrs": nodes, @@ -344,7 +314,7 @@ def get_run_settings( @staticmethod def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: - if test_launcher in ["pbs", "cobalt"]: + if test_launcher == "pbs": if not shutil.which("aprun"): hostlist = get_hostlist() else: @@ -698,3 +668,7 @@ def setup_test_colo( assert colo_model.colocated # Check to make sure that limit_db_cpus made it into the colo settings return colo_model + +@pytest.fixture +def config() -> smartsim._core.config.Config: + return CONFIG diff --git a/doc/_static/custom_tab_style.css b/doc/_static/custom_tab_style.css new file mode 100644 index 000000000..f31e13667 --- /dev/null +++ b/doc/_static/custom_tab_style.css @@ -0,0 +1,7 @@ +.sphinx-tabs-panel { + background-color: inherit; +} + +.sphinx-tabs-tab[aria-selected="true"] { + background-color: inherit; +} \ No newline at end of file diff --git a/doc/_static/version_names.json b/doc/_static/version_names.json index 8ae78ebdb..7b49ea2cc 100644 --- 
a/doc/_static/version_names.json +++ b/doc/_static/version_names.json @@ -1,7 +1,8 @@ { "version_names":[ "develop (unstable)", - "0.6.0 (stable)", + "0.6.1 (stable)", + "0.6.0", "0.5.1", "0.5.0", "0.4.2", @@ -12,6 +13,7 @@ "version_urls": [ "https://www.craylabs.org/develop/overview.html", "https://www.craylabs.org/docs/overview.html", + "https://www.craylabs.org/docs/versions/0.6.0/overview.html", "https://www.craylabs.org/docs/versions/0.5.1/overview.html", "https://www.craylabs.org/docs/versions/0.5.0/overview.html", "https://www.craylabs.org/docs/versions/0.4.2/overview.html", diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index 5136c8aa5..adf7081ec 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -43,8 +43,8 @@ Settings are provided to ``Model`` and ``Ensemble`` objects to provide parameters for how a job should be executed. Some are specifically meant for certain launchers like ``SbatchSettings`` is solely meant for system using Slurm as a workload manager. -``MpirunSettings`` for OpenMPI based jobs is supported by Slurm, -PBSPro, and Cobalt. +``MpirunSettings`` for OpenMPI based jobs is supported by Slurm +and PBSPro. Types of Settings: @@ -60,7 +60,6 @@ Types of Settings: JsrunSettings SbatchSettings QsubBatchSettings - CobaltBatchSettings BsubBatchSettings Settings objects can accept a container object that defines a container @@ -137,7 +136,7 @@ AprunSettings ``AprunSettings`` can be used on any system that supports the Cray ALPS layer. SmartSim supports using ``AprunSettings`` -on PBSPro and Cobalt WLM systems. +on PBSPro WLM systems. ``AprunSettings`` can be used in interactive session (on allocation) and within batch launches (e.g., ``QsubBatchSettings``) @@ -204,7 +203,7 @@ MpirunSettings ``MpirunSettings`` are for launching with OpenMPI. ``MpirunSettings`` are -supported on Slurm, PBSpro, and Cobalt. +supported on Slurm and PBSpro. .. 
autosummary:: @@ -231,7 +230,7 @@ MpiexecSettings ``MpiexecSettings`` are for launching with OpenMPI's ``mpiexec``. ``MpirunSettings`` are -supported on Slurm, PBSpro, and Cobalt. +supported on Slurm and PBSpro. .. autosummary:: @@ -258,7 +257,7 @@ OrterunSettings ``OrterunSettings`` are for launching with OpenMPI's ``orterun``. ``OrterunSettings`` are -supported on Slurm, PBSpro, and Cobalt. +supported on Slurm and PBSpro. .. autosummary:: @@ -336,32 +335,6 @@ be launched as a batch on PBSPro systems. :members: -.. _cqsub_api: - - -CobaltBatchSettings -------------------- - -``CobaltBatchSettings`` are used to configure jobs that should -be launched as a batch on Cobalt Systems. They closely mimic -that of the ``QsubBatchSettings`` for PBSPro. - - -.. autosummary:: - - CobaltBatchSettings.set_account - CobaltBatchSettings.set_batch_command - CobaltBatchSettings.set_nodes - CobaltBatchSettings.set_queue - CobaltBatchSettings.set_walltime - CobaltBatchSettings.format_batch_args - -.. autoclass:: CobaltBatchSettings - :inherited-members: - :undoc-members: - :members: - - .. 
_bsub_api: BsubBatchSettings diff --git a/doc/changelog.rst b/doc/changelog.rst index befb9ee37..e11455624 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -11,12 +11,118 @@ Jump to :ref:`SmartRedis Changelog ` SmartSim ======== +0.6.1 +----- + +Released on 15 February, 2024 + +Description + +- Duplicate for DBModel/Script prevented +- Update license to include 2024 +- Telemetry monitor is now active by default +- Add support for Mac OSX on Apple Silicon +- Remove Torch warnings during testing +- Validate Slurm timing format +- Expose Python Typehints +- Fix test_logs to prevent generation of directory +- Fix Python Typehint for colocated database settings +- Python 3.11 Support +- Quality of life `smart validate` improvements +- Remove Cobalt support +- Enrich logging through context variables +- Upgrade Machine Learning dependencies +- Override sphinx-tabs background color +- Add concurrency group to test workflow +- Fix index when installing torch through smart build -Development branch ------------------- -To be released at some future point in time +Detailed Notes +- Modify the `git clone` for both Redis and RedisAI to set the line endings to + unix-style line endings when using MacOS on ARM. (SmartSim-PR482_) +- Separate install instructions are now provided for Mac OSX on x64 vs ARM64 (SmartSim-PR479_) +- Prevent duplicate ML model and script names being added to an + Ensemble member if the names exist. (SmartSim-PR475_) +- Updates `Copyright (c) 2021-2023` to `Copyright (c) 2021-2024` + in all of the necessary files. (SmartSim-PR485_) +- Fix a bug which prevented the expected behavior when the `SMARTSIM_LOG_LEVEL` + environment variable was set to `developer`. (SmartSim-PR473_) +- Sets the default value of the "enable telemetry" flag to on. + Bumps the output `manifest.json` version number to match that of + `smartdashboard` and pins a watchdog version to avoid build errors. 
+ (SmartSim-PR477_) +- Refactor logic of `Manifest.has_db_objects` to remove excess branching + and improve readability/maintainability. (SmartSim-PR476_) +- SmartSim can now be built and used on platforms using Apple Silicon + (ARM64). Currently, only the PyTorch backend is supported. Note that libtorch + will be downloaded from a CrayLabs github repo. (SmartSim-PR465_) +- Tests that were saving Torch models were emitting warnings. These warnings + were addressed by updating the model save test function. (SmartSim-PR472_) +- Validate the timing format when requesting a slurm allocation. (SmartSim-PR471_) +- Add and ship `py.typed` marker to expose inline type hints. Fix + type errors related to SmartRedis. (SmartSim-PR468_) +- Fix the `test_logs.py::test_context_leak` test that was + erroneously creating a directory named `some value` in SmartSim's root + directory. (SmartSim-PR467_) +- Add Python type hinting to colocated settings. (SmartSim-PR462_) +- Add github actions for running black and isort checks. (SmartSim-PR464_) +- Relax the required version of `typing_extensions`. (SmartSim-PR459_) +- Addition of Python 3.11 to SmartSim. (SmartSim-PR461_) +- Quality of life `smart validate` improvements such as setting `CUDA_VISIBLE_DEVICES` + environment variable within `smart validate` prior to importing any ML deps to + prevent false negatives on multi-GPU systems. Additionally, move SmartRedis logs + from standard out to dedicated log file in the validation temporary directory as well as + suppress `sklearn` deprecation warning by pinning `KMeans` constructor + argument. Lastly, move TF test to last as TF may reserve the GPUs it uses. + (SmartSim-PR458_) +- Some actions in the current GitHub CI/CD workflows were outdated. They were + replaced with the latest versions. (SmartSim-PR446_) +- As the Cobalt workload manager is not used on any system we are aware of, + its support in SmartSim was terminated and classes such as `CobaltLauncher` have + been removed. 
+ (SmartSim-PR448_) +- Experiment logs are written to a file that can be read by the dashboard. (SmartSim-PR452_) +- Updated SmartSim's machine learning backends to PyTorch 2.0.1, Tensorflow + 2.13.1, ONNX 1.14.1, and ONNX Runtime 1.16.1. As a result of this change, + there is now an available ONNX wheel for use with Python 3.10, and wheels for + all of SmartSim's machine learning backends with Python 3.11. + (SmartSim-PR451_) (SmartSim-PR461_) +- The sphinx-tabs documentation extension uses a white background for the tabs component. + A custom CSS for those components to inherit the overall theme color has + been added. (SmartSim-PR453_) +- Add concurrency groups to GitHub's CI/CD workflows, preventing + multiple workflows for the same PR from being launched concurrently. + (SmartSim-PR439_) +- Torch changed their preferred indexing when trying to install + their provided wheels. Updated the `pip install` command within + `smart build` to ensure that the appropriate packages can be found. + (SmartSim-PR449_) + + +.. _SmartSim-PR485: https://github.com/CrayLabs/SmartSim/pull/485 +.. _SmartSim-PR482: https://github.com/CrayLabs/SmartSim/pull/482 +.. _SmartSim-PR479: https://github.com/CrayLabs/SmartSim/pull/479 +.. _SmartSim-PR477: https://github.com/CrayLabs/SmartSim/pull/477 +.. _SmartSim-PR476: https://github.com/CrayLabs/SmartSim/pull/476 +.. _SmartSim-PR475: https://github.com/CrayLabs/SmartSim/pull/475 +.. _SmartSim-PR473: https://github.com/CrayLabs/SmartSim/pull/473 +.. _SmartSim-PR472: https://github.com/CrayLabs/SmartSim/pull/472 +.. _SmartSim-PR471: https://github.com/CrayLabs/SmartSim/pull/471 +.. _SmartSim-PR468: https://github.com/CrayLabs/SmartSim/pull/468 +.. _SmartSim-PR467: https://github.com/CrayLabs/SmartSim/pull/467 +.. _SmartSim-PR465: https://github.com/CrayLabs/SmartSim/pull/465 +.. _SmartSim-PR464: https://github.com/CrayLabs/SmartSim/pull/464 +.. _SmartSim-PR462: https://github.com/CrayLabs/SmartSim/pull/462 +.. 
_SmartSim-PR461: https://github.com/CrayLabs/SmartSim/pull/461 +.. _SmartSim-PR459: https://github.com/CrayLabs/SmartSim/pull/459 +.. _SmartSim-PR458: https://github.com/CrayLabs/SmartSim/pull/458 +.. _SmartSim-PR453: https://github.com/CrayLabs/SmartSim/pull/453 +.. _SmartSim-PR452: https://github.com/CrayLabs/SmartSim/pull/452 +.. _SmartSim-PR451: https://github.com/CrayLabs/SmartSim/pull/451 +.. _SmartSim-PR449: https://github.com/CrayLabs/SmartSim/pull/449 +.. _SmartSim-PR448: https://github.com/CrayLabs/SmartSim/pull/448 +.. _SmartSim-PR446: https://github.com/CrayLabs/SmartSim/pull/446 +.. _SmartSim-PR439: https://github.com/CrayLabs/SmartSim/pull/439 0.6.0 ----- @@ -434,7 +540,7 @@ Expand Machine Learning Library Support: Expand Launcher Setting Options: - - Add ability to use base ``RunSettings`` on a Slurm, PBS, or Cobalt launchers (SmartSim-PR90_) + - Add ability to use base ``RunSettings`` on Slurm or PBS launchers (SmartSim-PR90_) - Add ability to use base ``RunSettings`` on LFS launcher (SmartSim-PR108_) Deprecations and Breaking Changes diff --git a/doc/conf.py b/doc/conf.py index 908b9534f..d5b6f21da 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -19,14 +19,14 @@ # -- Project information ----------------------------------------------------- project = 'SmartSim' -copyright = '2021-2023, Hewlett Packard Enterprise' +copyright = '2021-2024, Hewlett Packard Enterprise' author = 'Cray Labs' try: import smartsim version = smartsim.__version__ except ImportError: - version = "0.6.0" + version = "0.6.1" # The full version, including alpha/beta/rc tags release = version @@ -100,6 +100,11 @@ "extra_footer": extra_footer, } +# Use a custom style sheet to prevent the sphinx-tabs extension from using +# a white background with dark themes. If sphinx-tabs updates its +# static/tabs.css, this may need to be updated. 
+html_css_files = ['custom_tab_style.css'] + autoclass_content = 'both' add_module_names = False diff --git a/doc/developer.rst b/doc/developer.rst index 4009819c3..632ee8d45 100644 --- a/doc/developer.rst +++ b/doc/developer.rst @@ -84,14 +84,14 @@ Local ===== There are two levels of testing in SmartSim. The first runs by default and does -not launch any jobs out onto a system through a workload manager like Cobalt. +not launch any jobs out onto a system through a workload manager like Slurm. If any of the above commands are used, the test suite will run the "light" test suite by default. -PBSPro, Slurm, Cobalt, LSF -========================== +PBSPro, Slurm, LSF +================== To run the full test suite, users will have to be on a system with one of the above workload managers. Additionally, users will need to obtain an allocation @@ -105,9 +105,6 @@ of at least 3 nodes. # for PBSPro (with aprun) qsub -l select=3 -l place=scatter -l walltime=00:10:00 -q queue - # for Cobalt (with aprun) - qsub -n 3 -t 00:10:00 -A account -q queue -I - # for LSF (with jsrun) bsub -Is -W 00:30 -nnodes 3 -P project $SHELL @@ -117,7 +114,6 @@ Once in an iterative allocation, users will need to set the test launcher environment variable: ``SMARTSIM_TEST_LAUNCHER`` to one of the following values - slurm - - cobalt - pbs - lsf - local diff --git a/doc/experiment.rst b/doc/experiment.rst index f7950d6d6..986db4cad 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -38,8 +38,8 @@ available compute resources on the system. Each launcher supports specific types of ``RunSettings``. 
- :ref:`SrunSettings ` for Slurm - - :ref:`AprunSettings ` for PBSPro and Cobalt - - :ref:`MpirunSettings ` for OpenMPI with `mpirun` on PBSPro, Cobalt, LSF, and Slurm + - :ref:`AprunSettings ` for PBSPro + - :ref:`MpirunSettings ` for OpenMPI with `mpirun` on PBSPro, LSF, and Slurm - :ref:`JsrunSettings ` for LSF These settings can be manually specified by the user, or auto-detected by the @@ -181,7 +181,6 @@ workload manager and available compute resources. - :ref:`SbatchSettings ` for Slurm - :ref:`QsubBatchSettings ` for PBSPro - - :ref:`CobaltBatchSettings ` for Cobalt - :ref:`BsubBatchSettings ` for LSF If it only passed ``RunSettings``, ``Ensemble``, objects will require either diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index 3874eb961..2f43db50f 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -18,7 +18,7 @@ Basic The base prerequisites to install SmartSim and SmartRedis are: - - Python 3.8-3.10 + - Python 3.8-3.11 - Pip - Cmake 3.13.x (or later) - C compiler @@ -33,6 +33,13 @@ The base prerequisites to install SmartSim and SmartRedis are: GCC 5-9, 11, and 12 is recommended. There are known bugs with GCC 10. +.. warning:: + + Apple Clang 15 seems to have issues on MacOS with Apple Silicon. Please modify + your path to ensure that a version of GCC installed by brew has priority. Note + this seems to be hardcoded to `gcc` and `g++` in the Redis build so ensure that + `which gcc g++` do not point to Apple Clang. 
+ GPU Support =========== @@ -41,7 +48,7 @@ The machine-learning backends have additional requirements in order to use GPUs for inference - `CUDA Toolkit 11 (tested with 11.8) `_ - - `cuDNN 8 (tested with 8.2.1 and 8.4.0) `_ + - `cuDNN 8 (tested with 8.9.1) `_ - OS: Linux - GPU: Nvidia @@ -63,19 +70,20 @@ Supported Versions - GPU - Python Versions * - MacOS - - x86_64 + - x86_64, aarch64 - Not supported - - 3.8 - 3.10 + - 3.8 - 3.11 * - Linux - x86_64 - Nvidia - - 3.8 - 3.10 + - 3.8 - 3.11 .. note:: - Windows is not supported and there are currently no plans - to support Windows. + Users have successfully run SmartSim on Windows using Windows Subsystem for Linux + with Nvidia support. Generally, users should follow the Linux instructions here, + however we make no guarantee or offer of support. Native support for various machine learning libraries and their @@ -84,15 +92,19 @@ versions is dictated by our dependency on RedisAI_ 1.2.7. +------------------+----------+-------------+---------------+ | RedisAI | PyTorch | Tensorflow | ONNX Runtime | +==================+==========+=============+===============+ -| 1.2.7 (default) | 1.11.0 | 2.8.0 | 1.11.1 | +| 1.2.7 (default) | 2.0.1 | 2.13.1 | 1.16.3 | +------------------+----------+-------------+---------------+ +.. warning:: + + On Apple Silicon, only the PyTorch backend is supported for now. Please contact us + if you need support for other backends. + TensorFlow_ 2.0 and Keras_ are supported through `graph freezing`_. ScikitLearn_ and Spark_ models are supported by SmartSim as well through the use of the ONNX_ runtime (which is not built by -default due to issues with glibc on a variety of Linux -platforms and lack of support for Mac OS X). +default due to issues with glibc on a variety of Linux platforms). .. _Spark: https://spark.apache.org/mllib/ .. _Keras: https://keras.io @@ -106,7 +118,7 @@ platforms and lack of support for Mac OS X). 
------------------------------------------------------------ MacOS-only -========== +============ We recommend users and contributors install brew_ for managing installed packages. For contributors, the following brew packages can be helpful: @@ -242,9 +254,9 @@ SmartSim does. * - Platform - Python Versions * - MacOS - - 3.7 - 3.10 + - 3.8 - 3.11 * - Linux - - 3.7 - 3.10 + - 3.8 - 3.11 The Python client for SmartRedis is installed through ``pip`` as follows: @@ -275,6 +287,7 @@ First, clone SmartSim. git clone https://github.com/CrayLabs/SmartSim smartsim + And then install SmartSim with pip in *editable* mode. This way, SmartSim is installed in your virtual environment and available on `sys.path`, but the source remains at the site of the clone instead of in site-packages. @@ -287,12 +300,29 @@ source remains at the site of the clone instead of in site-packages. Use the now installed ``smart`` cli to install the machine learning runtimes. -.. code-block:: bash +.. tabs:: + + .. tab:: Linux + + .. code-block:: bash + + # run one of the following + smart build --device cpu --onnx # install with cpu-only support + smart build --device gpu --onnx # install with both cpu and gpu support + + + .. tab:: MacOS (Intel x64) + + .. code-block:: bash + + smart build --device cpu --onnx # install all backends (PT, TF, ONNX) on cpu + + + .. tab:: MacOS (Apple Silicon) + + .. 
code-block:: bash - # run one of the following - smart build -v --device cpu # verbose install cpu - smart build -v --device gpu # verbose install gpu - smart build -v --device gpu --onnx # install all backends (PT, TF, ONNX) on gpu + smart build --device cpu --no_tf # Only install PyTorch (TF/ONNX unsupported) Build the SmartRedis library diff --git a/doc/installation_instructions/platform/olcf-summit.rst b/doc/installation_instructions/platform/olcf-summit.rst index dfd7b9666..422c771b4 100644 --- a/doc/installation_instructions/platform/olcf-summit.rst +++ b/doc/installation_instructions/platform/olcf-summit.rst @@ -19,7 +19,7 @@ into problems. .. code-block:: bash # setup Python and build environment - export ENV_NAME=smartsim-0.6.0 + export ENV_NAME=smartsim-0.6.1 git clone https://github.com/CrayLabs/SmartRedis.git smartredis git clone https://github.com/CrayLabs/SmartSim.git smartsim conda config --prepend channels https://ftp.osuosl.org/pub/open-ce/1.6.1/ diff --git a/doc/launchers.rst b/doc/launchers.rst index 7d0c9970f..22425071e 100644 --- a/doc/launchers.rst +++ b/doc/launchers.rst @@ -16,9 +16,8 @@ SmartSim currently supports 5 `launchers`: 1. ``local``: for single-node, workstation, or laptop 2. ``slurm``: for systems using the Slurm scheduler 3. ``pbs``: for systems using the PBSpro scheduler - 4. ``cobalt``: for systems using the Cobalt scheduler - 5. ``lsf``: for systems using the LSF scheduler - 6. ``auto``: have SmartSim auto-detect the launcher to use. + 4. ``lsf``: for systems using the LSF scheduler + 5. ``auto``: have SmartSim auto-detect the launcher to use. To specify a specific launcher, one argument needs to be provided to the ``Experiment`` initialization. @@ -30,7 +29,6 @@ to the ``Experiment`` initialization. 
exp = Experiment("name-of-experiment", launcher="local") # local launcher exp = Experiment("name-of-experiment", launcher="slurm") # Slurm launcher exp = Experiment("name-of-experiment", launcher="pbs") # PBSpro launcher - exp = Experiment("name-of-experiment", launcher="cobalt") # Cobalt launcher exp = Experiment("name-of-experiment", launcher="lsf") # LSF launcher exp = Experiment("name-of-experiment", launcher="auto") # auto-detect launcher @@ -219,42 +217,10 @@ creation. --------------------------------------------------------------------- -Cobalt -====== - -The Cobalt Launcher works just like the PBSPro launcher and -is compatible with ALPS and OpenMPI workloads as well. - -To use the Cobalt launcher, specify at ``Experiment`` initialization: - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("MOM6-double-gyre", launcher="cobalt") - - -Running on Cobalt ------------------ - -The Cobalt launcher supports three types of ``RunSettings``: - 1. :ref:`AprunSettings ` - 2. :ref:`MpirunSettings ` - 3. :ref:`MpiexecSettings ` - -As well as batch settings for ``qsub`` through: - 1. :ref:`CobaltBatchSettings ` - -Both supported ``RunSettings`` types above can be added -to a ``CobaltBatchSettings`` batch workload through ``Ensemble`` -creation. - ---------------------------------------------------------------------- - LSF === -The LSF Launcher works like the PBSPro and Cobalt launchers and +The LSF Launcher works like the PBSPro launcher and is compatible with LSF and OpenMPI workloads. To use the LSF launcher, specify at ``Experiment`` initialization: diff --git a/doc/ml_features.rst b/doc/ml_features.rst index 51027e7ae..6096f005e 100644 --- a/doc/ml_features.rst +++ b/doc/ml_features.rst @@ -169,7 +169,7 @@ to the DB using the SmartRedis client. .. group-tab:: PyTorch - PyTorch requires models to be `jit-traced `__. + PyTorch requires models to be `jit-traced `__. 
The method ``torch.jit.save()`` can either store the model in memory or on file. Here, we will keep it in memory as a bytestring. @@ -239,7 +239,7 @@ it can be uploaded to the DB using the SmartRedis client. .. group-tab:: PyTorch - PyTorch requires models to be `jit-traced `__. + PyTorch requires models to be `jit-traced `__. The method ``torch.jit.save()`` can either store the model in memory or on file. Here, we will save it to a file located at ``./traced_model.pt``. diff --git a/doc/overview.rst b/doc/overview.rst index 3ef046bb0..241d54eca 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -61,8 +61,7 @@ The key features of the IL are: - An API to start, monitor, and stop HPC jobs from Python or from a Jupyter notebook. - Automated deployment of in-memory data staging (`Redis `_) and computational storage (`RedisAI `_). - - Programmatic launches of batch and in-allocation jobs on PBS, Slurm, LSF, - and Cobalt systems. + - Programmatic launches of batch and in-allocation jobs on PBS, Slurm, and LSF systems. - Creating and configuring ensembles of workloads with isolated communication channels. 
The IL can configure and launch batch jobs as well as jobs within interactive diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 38d9c8052..e883a2805 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -6,9 +6,9 @@ sphinx-copybutton==0.5.2 sphinx-tabs==3.4.4 nbsphinx==0.9.3 docutils==0.18.1 -torch==1.11.0 -tensorflow==2.8.1 +torch==2.0.1 +tensorflow==2.13.1 ipython jinja2==3.1.2 protobuf -numpy \ No newline at end of file +numpy diff --git a/doc/testing.rst b/doc/testing.rst index bdaa473d7..ccb2db3c2 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -78,9 +78,6 @@ Examples of how to obtain allocations on systems with the launchers: # for PBSPro (with aprun) qsub -l select=4 -l place=scatter -l walltime=00:10:00 -q queue - # for Cobalt (with aprun) - qsub -n 4 -t 00:10:00 -A account -q queue -I - # for LSF (with jsrun) bsub -Is -W 00:30 -nnodes 4 -P project $SHELL @@ -91,7 +88,6 @@ launcher environment variable: ``SMARTSIM_TEST_LAUNCHER`` to one of the following values - slurm - - cobalt - pbs - lsf - local @@ -273,4 +269,3 @@ The actions are defined using yaml files are are located in the Each pull request, push and merge the test suite for SmartRedis and SmartSim are run. For SmartSim, this is the ``local`` test suite with the local launcher. - diff --git a/docker-compose.yml b/docker-compose.yml index c492e6324..f69743f14 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: - "8888:8888" tutorials-prod: - image: smartsim-tutorials:v0.4.2 + image: smartsim-tutorials:v0.6.1 build: context: . dockerfile: ./docker/prod/Dockerfile diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile index 6a5f82642..c643787c3 100644 --- a/docker/dev/Dockerfile +++ b/docker/dev/Dockerfile @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index eee809910..eff99de36 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile index 628d9af60..09e94dee0 100644 --- a/docker/prod/Dockerfile +++ b/docker/prod/Dockerfile @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,7 +46,7 @@ COPY --chown=craylabs:root ./tutorials/ /home/craylabs/tutorials/ USER craylabs RUN export PATH=/home/craylabs/.local/bin:$PATH && \ echo "export PATH=/home/craylabs/.local/bin:$PATH" >> /home/craylabs/.bashrc && \ - python -m pip install smartsim[ml]==0.6.0 jupyter jupyterlab matplotlib && \ + python -m pip install smartsim[ml]==0.6.1 jupyter jupyterlab matplotlib && \ smart build --device cpu -v && \ chown craylabs:root -R /home/craylabs/.local && \ rm -rf ~/.cache/pip diff --git a/docker/testing/Dockerfile b/docker/testing/Dockerfile index 1b59e0046..9c247c320 100644 --- a/docker/testing/Dockerfile +++ b/docker/testing/Dockerfile @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/pyproject.toml b/pyproject.toml index 60c33bee5..4415c63ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -91,18 +91,32 @@ disallow_untyped_defs = true disallow_incomplete_defs = true disallow_untyped_decorators = true +# Probably Unintended Branches/Blocks +# warn_unreachable = true + # Safety/Upgrading Mypy warn_unused_ignores = true warn_redundant_casts = true warn_unused_configs = true show_error_codes = true +# Misc Strictness Settings +strict_concatenate = false +strict_equality = true + +# Additional Error Codes +enable_error_code = [ + # "redundant-expr", + # "possibly-undefined", + # "unused-awaitable", + # "ignore-without-code", + # "mutable-override", +] + [[tool.mypy.overrides]] # Ignore packages that are not used or not typed module = [ "coloredlogs", - "smartredis", - "smartredis.error", "redis.cluster", "keras", "torch", diff --git a/setup.cfg b/setup.cfg index 49419c7eb..5fdfa82ae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -45,6 +45,7 @@ classifiers = Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 License :: OSI Approved :: BSD License Intended Audience :: Science/Research Topic :: Scientific/Engineering @@ -55,7 +56,7 @@ setup_requires = setuptools>=39.2 cmake>=3.13 include_package_data = True -python_requires = >=3.8,<3.11 +python_requires = >=3.8,<3.12 [options.packages.find] include = @@ -67,5 +68,7 @@ exclude = smartredis [options.package_data] +smartsim = + py.typed smartsim._core.bin = * diff --git a/setup.py b/setup.py index 66a534456..bc7cf60d6 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -167,7 +167,7 @@ def has_ext_modules(_placeholder): "tqdm>=4.50.2", "filelock>=3.4.2", "protobuf~=3.20", - "watchdog>=3.0.0", + "watchdog>=3.0.0,<4.0.0", ] # Add SmartRedis at specific version @@ -190,6 +190,7 @@ def has_ext_modules(_placeholder): "types-tqdm", "types-tensorflow==2.12.0.9", "types-setuptools", + "typing_extensions>=4.1.0", ], # see smartsim/_core/_install/buildenv.py for more details **versions.ml_extras_required() diff --git a/smartsim/__init__.py b/smartsim/__init__.py index d3f5062b8..7c1fa2fe0 100644 --- a/smartsim/__init__.py +++ b/smartsim/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/__init__.py b/smartsim/_core/__init__.py index b835c7a0e..bbc108f48 100644 --- a/smartsim/_core/__init__.py +++ b/smartsim/_core/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/__init__.py b/smartsim/_core/_cli/__init__.py index bf6fd954c..efe03908e 100644 --- a/smartsim/_core/_cli/__init__.py +++ b/smartsim/_core/_cli/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/__main__.py b/smartsim/_core/_cli/__main__.py index 47df07048..66a50095a 100644 --- a/smartsim/_core/_cli/__main__.py +++ b/smartsim/_core/_cli/__main__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 474d96c8a..443b916b7 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -26,6 +26,7 @@ import argparse import os +import platform import sys import typing as t from pathlib import Path @@ -60,14 +61,6 @@ def check_py_onnx_version(versions: Versioner) -> None: """Check Python environment for ONNX installation""" - if not versions.ONNX: - py_version = sys.version_info - msg = ( - "An onnx wheel is not available for " - f"Python {py_version.major}.{py_version.minor}. " - "Instead consider using Python 3.8 or 3.9 for ONNX 1.11 support" - ) - raise SetupError(msg) _check_packages_in_python_env( { "onnx": Version_(versions.ONNX), @@ -121,7 +114,12 @@ def build_database( # check database installation database_name = "KeyDB" if keydb else "Redis" database_builder = builder.DatabaseBuilder( - build_env(), build_env.MALLOC, build_env.JOBS, verbose + build_env(), + jobs=build_env.JOBS, + _os=builder.OperatingSystem.from_str(platform.system()), + architecture=builder.Architecture.from_str(platform.machine()), + malloc=build_env.MALLOC, + verbose=verbose, ) if not database_builder.is_built: logger.info( @@ -153,7 +151,7 @@ def build_redis_ai( backends_table = [ ["PyTorch", versions.TORCH, color_bool(use_torch)], ["TensorFlow", versions.TENSORFLOW, color_bool(use_tf)], - ["ONNX", versions.ONNX or "Unavailable", color_bool(use_onnx)], + ["ONNX", versions.ONNX, color_bool(use_onnx)], ] print(tabulate(backends_table, tablefmt="fancy_outline"), end="\n\n") print(f"Building for GPU support: {color_bool(device == 'gpu')}\n") @@ -181,12 +179,14 @@ def build_redis_ai( rai_builder = builder.RedisAIBuilder( build_env=build_env_dict, + jobs=build_env.JOBS, + _os=builder.OperatingSystem.from_str(platform.system()), + architecture=builder.Architecture.from_str(platform.machine()), torch_dir=str(torch_dir) if torch_dir else "", libtf_dir=str(libtf_dir) if libtf_dir else "", build_torch=use_torch, build_tf=use_tf, build_onnx=use_onnx, - jobs=build_env.JOBS, verbose=verbose, ) @@ -226,9 
+226,10 @@ def build_redis_ai( logger.info("ML Backends and RedisAI build complete!") -def check_py_torch_version(versions: Versioner, device: _TDeviceStr = "cpu") -> None: +def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu") -> None: """Check Python environment for TensorFlow installation""" + device = device_in.lower() if BuildEnv.is_macos(): if device == "gpu": raise BuildError("SmartSim does not support GPU on MacOS") @@ -260,10 +261,11 @@ def check_py_torch_version(versions: Versioner, device: _TDeviceStr = "cpu") -> "Torch version not found in python environment. " "Attempting to install via `pip`" ) + wheel_device = device if device == "cpu" else device_suffix.replace("+", "") pip( "install", - "-f", - "https://download.pytorch.org/whl/torch_stable.html", + "--extra-index-url", + f"https://download.pytorch.org/whl/{wheel_device}", *(f"{package}=={version}" for package, version in torch_deps.items()), ) elif missing or conflicts: diff --git a/smartsim/_core/_cli/clean.py b/smartsim/_core/_cli/clean.py index d8a85f8a9..50e267d80 100644 --- a/smartsim/_core/_cli/clean.py +++ b/smartsim/_core/_cli/clean.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/cli.py b/smartsim/_core/_cli/cli.py index ef4c113e1..3cad573d1 100644 --- a/smartsim/_core/_cli/cli.py +++ b/smartsim/_core/_cli/cli.py @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/dbcli.py b/smartsim/_core/_cli/dbcli.py index ce0975bc4..733c2fe4d 100644 --- a/smartsim/_core/_cli/dbcli.py +++ b/smartsim/_core/_cli/dbcli.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/site.py b/smartsim/_core/_cli/site.py index c86e0341b..386f642c0 100644 --- a/smartsim/_core/_cli/site.py +++ b/smartsim/_core/_cli/site.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index e31d0aed2..8bf0984df 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index bda254859..8ea40ae00 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -25,13 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import contextlib import io import multiprocessing as mp import os +import os.path import socket import tempfile import typing as t -from contextlib import contextmanager from types import TracebackType import numpy as np @@ -52,8 +53,6 @@ if t.TYPE_CHECKING: - # Pylint disables needed for old version of pylint w/ TF 2.6.2 - # pylint: disable-next=unused-import from multiprocessing.connection import Connection # pylint: disable-next=unsubscriptable-object @@ -89,12 +88,23 @@ def execute( simple experiment """ backends = installed_redisai_backends() + device: _TCapitalDeviceStr = args.device.upper() try: - with _VerificationTempDir(dir=os.getcwd()) as temp_dir: + with contextlib.ExitStack() as ctx: + temp_dir = ctx.enter_context(_VerificationTempDir(dir=os.getcwd())) + validate_env = { + "SR_LOG_LEVEL": os.environ.get("SR_LOG_LEVEL", "INFO"), + "SR_LOG_FILE": os.environ.get( + "SR_LOG_FILE", os.path.join(temp_dir, "smartredis.log") + ), + } + if device == "GPU": + validate_env["CUDA_VISIBLE_DEVICES"] = "0" + ctx.enter_context(_env_vars_set_to(validate_env)) test_install( location=temp_dir, port=args.port, - device=args.device.upper(), + device=device, with_tf="tensorflow" in backends, with_pt="torch" in backends, with_onnx="onnxruntime" in backends, @@ -147,18 +157,40 @@ def test_install( logger.info("Verifying Tensor Transfer") client.put_tensor("plain-tensor", np.ones((1, 1, 3, 3))) client.get_tensor("plain-tensor") - if with_tf: - logger.info("Verifying TensorFlow Backend") - _test_tf_install(client, location, device) if with_pt: logger.info("Verifying Torch Backend") _test_torch_install(client, device) if with_onnx: logger.info("Verifying ONNX Backend") _test_onnx_install(client, device) + if with_tf: # Run last in case TF locks an entire GPU + logger.info("Verifying TensorFlow Backend") + _test_tf_install(client, location, device) + logger.info("Success!") -@contextmanager +@contextlib.contextmanager +def _env_vars_set_to( + evars: 
t.Mapping[str, t.Optional[str]] +) -> t.Generator[None, None, None]: + envvars = tuple((var, os.environ.pop(var, None), val) for var, val in evars.items()) + for var, _, tmpval in envvars: + _set_or_del_env_var(var, tmpval) + try: + yield + finally: + for var, origval, _ in reversed(envvars): + _set_or_del_env_var(var, origval) + + +def _set_or_del_env_var(var: str, val: t.Optional[str]) -> None: + if val is not None: + os.environ[var] = val + else: + os.environ.pop(var, None) + + +@contextlib.contextmanager def _make_managed_local_orc( exp: Experiment, port: int ) -> t.Generator[Client, None, None]: @@ -243,9 +275,18 @@ def __init__(self) -> None: def forward(self, x: torch.Tensor) -> torch.Tensor: return self.conv(x) + if device == "GPU": + device_ = torch.device("cuda") + else: + device_ = torch.device("cpu") + net = Net() - forward_input = torch.rand(1, 1, 3, 3) + net.to(device_) + net.eval() + + forward_input = torch.rand(1, 1, 3, 3).to(device_) traced = torch.jit.trace(net, forward_input) # type: ignore[no-untyped-call] + buffer = io.BytesIO() torch.jit.save(traced, buffer) # type: ignore[no-untyped-call] model = buffer.getvalue() @@ -261,7 +302,7 @@ def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None: from sklearn.cluster import KMeans data = np.arange(20, dtype=np.float32).reshape(10, 2) - model = KMeans(n_clusters=2) + model = KMeans(n_clusters=2, n_init=10) model.fit(data) kmeans = to_onnx(model, data, target_opset=11) diff --git a/smartsim/_core/_install/__init__.py b/smartsim/_core/_install/__init__.py index bf6fd954c..efe03908e 100644 --- a/smartsim/_core/_install/__init__.py +++ b/smartsim/_core/_install/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index eaa2c68bd..85090ba0a 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -193,23 +193,17 @@ class RedisAIVersion(Version_): defaults = { "1.2.7": { - "tensorflow": "2.8.0", - "onnx": "1.11.0", - "skl2onnx": "1.11.1", - "onnxmltools": "1.11.1", - "scikit-learn": "1.1.1", - "torch": "1.11.0", + "tensorflow": "2.13.1", + "onnx": "1.14.1", + "skl2onnx": "1.16.0", + "onnxmltools": "1.12.0", + "scikit-learn": "1.3.2", + "torch": "2.0.1", "torch_cpu_suffix": "+cpu", - "torch_cuda_suffix": "+cu113", - "torchvision": "0.12.0", + "torch_cuda_suffix": "+cu117", + "torchvision": "0.15.2", }, } - # Remove options with unsported wheels for python>=3.10 - if sys.version_info >= (3, 10): - defaults["1.2.7"].pop("onnx") - defaults["1.2.7"].pop("skl2onnx") - defaults["1.2.7"].pop("onnxmltools") - defaults["1.2.7"].pop("scikit-learn") def __init__(self, vers: str) -> None: # pylint: disable=super-init-not-called min_rai_version = min(Version_(ver) for ver in self.defaults) @@ -276,8 +270,8 @@ class Versioner: PYTHON_MIN = Version_("3.8.0") # Versions - SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.6.0")) - SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.0")) + SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.6.1")) + SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.1")) SMARTSIM_SUFFIX = get_env("SMARTSIM_SUFFIX", "") # Redis @@ -304,33 +298,19 @@ class Versioner: # TensorFlow and ONNX only use the defaults, but these are not built into # the RedisAI package and therefore the user is free to pick other versions. 
TENSORFLOW = Version_(REDISAI.tensorflow) - try: - ONNX = Version_(REDISAI.onnx) - except AttributeError: - ONNX = None - - def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Any]: - packages = [ - "SMARTSIM", - "SMARTREDIS", - db_name, - "REDISAI", - "TORCH", - "TENSORFLOW", - ] - versions = [ - self.SMARTSIM, - self.SMARTREDIS, - self.REDIS, - self.REDISAI, - self.TORCH, - self.TENSORFLOW, - ] - if self.ONNX: - packages.append("ONNX") - versions.append(self.ONNX) - vers = {"Packages": packages, "Versions": versions} - return vers + ONNX = Version_(REDISAI.onnx) + + def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]: + pkg_map = { + "SMARTSIM": self.SMARTSIM, + "SMARTREDIS": self.SMARTREDIS, + db_name: self.REDIS, + "REDISAI": self.REDISAI, + "TORCH": self.TORCH, + "TENSORFLOW": self.TENSORFLOW, + "ONNX": self.ONNX, + } + return {"Packages": tuple(pkg_map), "Versions": tuple(pkg_map.values())} def ml_extras_required(self) -> t.Dict[str, t.List[str]]: """Optional ML/DL dependencies we suggest for the user. diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index f96a9bb5f..c098cfd01 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,25 +24,39 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# pylint: disable=too-many-lines + +import concurrent.futures +import enum +import itertools import os +import platform import re import shutil import stat import subprocess import sys +import tarfile +import tempfile import typing as t +import urllib.request +import zipfile +from abc import ABC, abstractmethod +from dataclasses import dataclass from pathlib import Path from shutil import which from subprocess import SubprocessError -# NOTE: This will be imported by setup.py and hence no -# smartsim related items should be imported into -# this file. +# NOTE: This will be imported by setup.py and hence no smartsim related +# items should be imported into this file. -# TODO: -# - check cmake version and use system if possible to avoid conflicts +# TODO: check cmake version and use system if possible to avoid conflicts TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime", "tflite"] +TDeviceStr = t.Literal["cpu", "gpu"] + +_T = t.TypeVar("_T") +_U = t.TypeVar("_U") def expand_exe_path(exe: str) -> str: @@ -69,6 +83,37 @@ class BuildError(Exception): pass +class Architecture(enum.Enum): + X64 = ("x86_64", "amd64") + ARM64 = ("arm64",) + + @classmethod + def from_str(cls, string: str, /) -> "Architecture": + string = string.lower() + for type_ in cls: + if string in type_.value: + return type_ + raise BuildError(f"Unrecognized or unsupported architecture: {string}") + + +class OperatingSystem(enum.Enum): + LINUX = ("linux", "linux2") + DARWIN = ("darwin",) + + @classmethod + def from_str(cls, string: str, /) -> "OperatingSystem": + string = string.lower() + for type_ in cls: + if string in type_.value: + return type_ + raise BuildError(f"Unrecognized or unsupported operating system: {string}") + + +class Platform(t.NamedTuple): + os: OperatingSystem + architecture: Architecture + + class Builder: """Base class for building third-party libraries""" @@ -83,10 +128,16 @@ class Builder: ) def __init__( - self, env: t.Dict[str, t.Any], jobs: t.Optional[int] = 
1, verbose: bool = False + self, + env: t.Dict[str, str], + jobs: int = 1, + _os: OperatingSystem = OperatingSystem.from_str(platform.system()), + architecture: Architecture = Architecture.from_str(platform.machine()), + verbose: bool = False, ) -> None: # build environment from buildenv self.env = env + self._platform = Platform(_os, architecture) # Find _core directory and set up paths _core_dir = Path(os.path.abspath(__file__)).parent.parent @@ -99,12 +150,7 @@ def __init__( self.bin_path = dependency_path / "bin" self.lib_path = dependency_path / "lib" - - # Set wether build process will output to std output - self.out: t.Optional[int] = subprocess.DEVNULL self.verbose = verbose - if self.verbose: - self.out = None # make build directory "SmartSim/smartsim/_core/.third-party" if not self.build_dir.is_dir(): @@ -117,12 +163,18 @@ def __init__( self.jobs = jobs + @property + def out(self) -> t.Optional[int]: + return None if self.verbose else subprocess.DEVNULL + # implemented in base classes @property def is_built(self) -> bool: raise NotImplementedError - def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None: + def build_from_git( + self, git_url: str, branch: str, device: TDeviceStr = "cpu" + ) -> None: raise NotImplementedError @staticmethod @@ -197,12 +249,20 @@ class DatabaseBuilder(Builder): def __init__( self, - build_env: t.Optional[t.Dict[str, t.Any]] = None, + build_env: t.Optional[t.Dict[str, str]] = None, malloc: str = "libc", - jobs: t.Optional[int] = None, + jobs: int = 1, + _os: OperatingSystem = OperatingSystem.from_str(platform.system()), + architecture: Architecture = Architecture.from_str(platform.machine()), verbose: bool = False, ) -> None: - super().__init__(build_env or {}, jobs=jobs, verbose=verbose) + super().__init__( + build_env or {}, + jobs=jobs, + _os=_os, + architecture=architecture, + verbose=verbose, + ) self.malloc = malloc @property @@ -213,7 +273,9 @@ def is_built(self) -> bool: keydb_files = 
{"keydb-server", "keydb-cli"} return redis_files.issubset(bin_files) or keydb_files.issubset(bin_files) - def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None: + def build_from_git( + self, git_url: str, branch: str, device: TDeviceStr = "cpu" + ) -> None: """Build Redis from git :param git_url: url from which to retrieve Redis :type git_url: str @@ -237,17 +299,21 @@ def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None if not self.is_valid_url(git_url): raise BuildError(f"Malformed {database_name} URL: {git_url}") + clone_cmd = config_git_command( + self._platform, + [ + self.binary_path("git"), + "clone", + git_url, + "--branch", + branch, + "--depth", + "1", + database_name, + ], + ) + # clone Redis - clone_cmd = [ - self.binary_path("git"), - "clone", - git_url, - "--branch", - branch, - "--depth", - "1", - database_name, - ] self.run_command(clone_cmd, cwd=self.build_dir) # build Redis @@ -288,6 +354,37 @@ def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None raise BuildError("Installation of redis-cli failed!") from e +class _RAIBuildDependency(ABC): + """An interface with a collection of magic methods so that + ``RedisAIBuilder`` can fetch and place its own dependencies + """ + + @property + @abstractmethod + def __rai_dependency_name__(self) -> str: ... + + @abstractmethod + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: ... + + @staticmethod + @abstractmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: ... 
+ + +def _place_rai_dep_at( + target: t.Union[str, "os.PathLike[str]"], verbose: bool +) -> t.Callable[[_RAIBuildDependency], Path]: + def _place(dep: _RAIBuildDependency) -> Path: + if verbose: + print(f"Placing: '{dep.__rai_dependency_name__}'") + path = dep.__place_for_rai__(target) + if verbose: + print(f"Placed: '{dep.__rai_dependency_name__}' at '{path}'") + return path + + return _place + + class RedisAIBuilder(Builder): """Class to build RedisAI from Source Supported build method: @@ -298,16 +395,25 @@ class RedisAIBuilder(Builder): def __init__( self, - build_env: t.Optional[t.Dict[str, t.Any]] = None, + _os: OperatingSystem = OperatingSystem.from_str(platform.system()), + architecture: Architecture = Architecture.from_str(platform.machine()), + build_env: t.Optional[t.Dict[str, str]] = None, torch_dir: str = "", libtf_dir: str = "", build_torch: bool = True, build_tf: bool = True, build_onnx: bool = False, - jobs: t.Optional[int] = None, + jobs: int = 1, verbose: bool = False, ) -> None: - super().__init__(build_env or {}, jobs=jobs, verbose=verbose) + super().__init__( + build_env or {}, + jobs=jobs, + _os=_os, + architecture=architecture, + verbose=verbose, + ) + self.rai_install_path: t.Optional[Path] = None # convert to int for RAI build script @@ -317,6 +423,29 @@ def __init__( self.libtf_dir = libtf_dir self.torch_dir = torch_dir + # Sanity checks + self._validate_platform() + + def _validate_platform(self) -> None: + unsupported = [] + if self._platform not in _DLPackRepository.supported_platforms(): + unsupported.append("DLPack") + if self.fetch_tf and (self._platform not in _TFArchive.supported_platforms()): + unsupported.append("Tensorflow") + if self.fetch_onnx and ( + self._platform not in _ORTArchive.supported_platforms() + ): + unsupported.append("ONNX") + if self.fetch_torch and ( + self._platform not in _PTArchive.supported_platforms() + ): + unsupported.append("PyTorch") + if unsupported: + raise BuildError( + f"The {', 
'.join(unsupported)} backend(s) are not supported " + f"on {self._platform.os} with {self._platform.architecture}" + ) + @property def rai_build_path(self) -> Path: return Path(self.build_dir, "RedisAI") @@ -351,6 +480,47 @@ def build_onnx(self) -> bool: def fetch_onnx(self) -> bool: return self.build_onnx + def get_deps_dir_path_for(self, device: TDeviceStr) -> Path: + def fail_to_format(reason: str) -> BuildError: # pragma: no cover + return BuildError(f"Failed to format RedisAI dependency path: {reason}") + + _os, architecture = self._platform + if _os == OperatingSystem.DARWIN: + os_ = "macos" + elif _os == OperatingSystem.LINUX: + os_ = "linux" + else: # pragma: no cover + raise fail_to_format(f"Unknown operating system: {_os}") + if architecture == Architecture.X64: + arch = "x64" + elif architecture == Architecture.ARM64: + arch = "arm64v8" + else: # pragma: no cover + raise fail_to_format(f"Unknown architecture: {architecture}") + return self.rai_build_path / f"deps/{os_}-{arch}-{device}" + + def _get_deps_to_fetch_for( + self, device: TDeviceStr + ) -> t.Tuple[_RAIBuildDependency, ...]: + os_, arch = self._platform + # TODO: It would be nice if the backend version numbers were declared + # alongside the python package version numbers so that all of the + # dependency versions were declared in single location. + # Unfortunately importing into this module is non-trivial as it + # is used as script in the SmartSim `setup.py`. + + # DLPack is always required + fetchable_deps: t.List[_RAIBuildDependency] = [_DLPackRepository("v0.5_RAI")] + if self.fetch_torch: + pt_dep = _choose_pt_variant(os_) + fetchable_deps.append(pt_dep(arch, device, "2.0.1")) + if self.fetch_tf: + fetchable_deps.append(_TFArchive(os_, arch, device, "2.13.1")) + if self.fetch_onnx: + fetchable_deps.append(_ORTArchive(os_, device, "1.16.3")) + + return tuple(fetchable_deps) + def symlink_libtf(self, device: str) -> None: """Add symbolic link to available libtensorflow in RedisAI deps. 
@@ -406,7 +576,9 @@ def symlink_libtf(self, device: str) -> None: if not dst_file.is_file(): os.symlink(src_file, dst_file) - def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None: + def build_from_git( + self, git_url: str, branch: str, device: TDeviceStr = "cpu" + ) -> None: """Build RedisAI from git :param git_url: url from which to retrieve RedisAI @@ -425,63 +597,24 @@ def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None raise BuildError(f"Malformed RedisAI URL: {git_url}") # clone RedisAI - clone_cmd = [ - self.binary_path("env"), - "GIT_LFS_SKIP_SMUDGE=1", - "git", - "clone", - "--recursive", - git_url, - ] - - checkout_osx_fix: t.List[str] = [] - - # Circumvent a bad `get_deps.sh` script from RAI on 1.2.7 with ONNX - # TODO: Look for a better way to do this or wait for RAI patch - if branch == "v1.2.7": - # Clone RAI patch commit for OSX - clone_cmd += ["RedisAI"] - checkout_osx_fix = [ + clone_cmd = config_git_command( + self._platform, + [ + self.binary_path("env"), + "GIT_LFS_SKIP_SMUDGE=1", "git", - "checkout", - "634916c722e718cc6ea3fad46e63f7d798f9adc2", - ] - else: - # Clone RAI release commit for versions > 1.2.7 - clone_cmd += [ + "clone", + "--recursive", + git_url, "--branch", branch, "--depth=1", - "RedisAI", - ] - - self.run_command(clone_cmd, out=subprocess.DEVNULL, cwd=self.build_dir) - if checkout_osx_fix: - self.run_command( - checkout_osx_fix, out=subprocess.DEVNULL, cwd=self.rai_build_path - ) - - # get RedisAI dependencies - dep_cmd = self._rai_build_env_prefix( - with_pt=self.build_torch, - with_tf=self.build_tf, - with_ort=self.build_onnx, - extra_env={"VERBOSE": "1"}, - ) - - dep_cmd.extend( - [ - self.binary_path("bash"), - str(self.rai_build_path / "get_deps.sh"), - str(device), - ] + os.fspath(self.rai_build_path), + ], ) - self.run_command( - dep_cmd, - out=subprocess.DEVNULL, # suppress this as it's not useful - cwd=self.rai_build_path, - ) + self.run_command(clone_cmd, 
out=subprocess.DEVNULL, cwd=self.build_dir) + self._fetch_deps_for(device) if self.libtf_dir and device: self.symlink_libtf(device) @@ -541,6 +674,25 @@ def _rai_build_env_prefix( *(f"{key}={val}" for key, val in extra_env.items()), ] + def _fetch_deps_for(self, device: TDeviceStr) -> None: + if not self.rai_build_path.is_dir(): + raise BuildError("RedisAI build directory not found") + + deps_dir = self.get_deps_dir_path_for(device) + deps_dir.mkdir(parents=True, exist_ok=True) + if any(deps_dir.iterdir()): + raise BuildError("RAI build dependency directory is not empty") + to_fetch = self._get_deps_to_fetch_for(device) + placed_paths = _threaded_map( + _place_rai_dep_at(deps_dir, self.verbose), to_fetch + ) + unique_placed_paths = {os.fspath(path.resolve()) for path in placed_paths} + if len(unique_placed_paths) != len(to_fetch): + raise BuildError( + f"Expected to place {len(to_fetch)} dependencies, but only " + f"found {len(unique_placed_paths)}" + ) + def _install_backends(self, device: str) -> None: """Move backend libraries to smartsim/_core/lib/ :param device: cpu or cpu @@ -578,3 +730,319 @@ def _move_torch_libs(self) -> None: if sys.platform == "darwin": dylibs = pip_torch_path / ".dylibs" self.copy_dir(dylibs, ss_rai_torch_path / ".dylibs", set_exe=True) + + +def _threaded_map(fn: t.Callable[[_T], _U], items: t.Iterable[_T]) -> t.Sequence[_U]: + items = tuple(items) + if not items: # No items so no work to do + return () + num_workers = min(len(items), (os.cpu_count() or 4) * 5) + with concurrent.futures.ThreadPoolExecutor(num_workers) as pool: + return tuple(pool.map(fn, items)) + + +class _WebLocation(ABC): + @property + @abstractmethod + def url(self) -> str: ... 
+ + +class _WebGitRepository(_WebLocation): + def clone( + self, + target: t.Union[str, "os.PathLike[str]"], + depth: t.Optional[int] = None, + branch: t.Optional[str] = None, + ) -> None: + depth_ = ("--depth", str(depth)) if depth is not None else () + branch_ = ("--branch", branch) if branch is not None else () + _git("clone", "-q", *depth_, *branch_, self.url, os.fspath(target)) + + +@t.final +@dataclass(frozen=True) +class _DLPackRepository(_WebGitRepository, _RAIBuildDependency): + version: str + + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ( + (OperatingSystem.LINUX, Architecture.X64), + (OperatingSystem.DARWIN, Architecture.X64), + (OperatingSystem.DARWIN, Architecture.ARM64), + ) + + @property + def url(self) -> str: + return "https://github.com/RedisAI/dlpack.git" + + @property + def __rai_dependency_name__(self) -> str: + return f"dlpack@{self.url}" + + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + target = Path(target) / "dlpack" + self.clone(target, branch=self.version, depth=1) + if not target.is_dir(): + raise BuildError("Failed to place dlpack") + return target + + +class _WebArchive(_WebLocation): + @property + def name(self) -> str: + _, name = self.url.rsplit("/", 1) + return name + + def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + target = Path(target) + if target.is_dir(): + target = target / self.name + file, _ = urllib.request.urlretrieve(self.url, target) + return Path(file).resolve() + + +class _ExtractableWebArchive(_WebArchive, ABC): + @abstractmethod + def _extract_download( + self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] + ) -> None: ... 
+ + def extract(self, target: t.Union[str, "os.PathLike[str]"]) -> None: + with tempfile.TemporaryDirectory() as tmp_dir: + arch_path = self.download(tmp_dir) + self._extract_download(arch_path, target) + + +class _WebTGZ(_ExtractableWebArchive): + def _extract_download( + self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] + ) -> None: + with tarfile.open(download_path, "r") as tgz_file: + tgz_file.extractall(target) + + +class _WebZip(_ExtractableWebArchive): + def _extract_download( + self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] + ) -> None: + with zipfile.ZipFile(download_path, "r") as zip_file: + zip_file.extractall(target) + + +@dataclass(frozen=True) +class _PTArchive(_WebZip, _RAIBuildDependency): + architecture: Architecture + device: TDeviceStr + version: str + + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + # TODO: This will need to be revisited if the inheritance tree gets deeper + return tuple( + itertools.chain.from_iterable( + var.supported_platforms() for var in _PTArchive.__subclasses__() + ) + ) + + @property + def __rai_dependency_name__(self) -> str: + return f"libtorch@{self.url}" + + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + self.extract(target) + target = Path(target) / "libtorch" + if not target.is_dir(): + raise BuildError("Failed to place RAI dependency: `libtorch`") + return target + + +@t.final +class _PTArchiveLinux(_PTArchive): + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ((OperatingSystem.LINUX, Architecture.X64),) + + @property + def url(self) -> str: + if self.device == "gpu": + pt_build = "cu117" + else: + pt_build = "cpu" + # pylint: disable-next=line-too-long + libtorch_archive = ( + f"libtorch-cxx11-abi-shared-without-deps-{self.version}%2B{pt_build}.zip" + ) + return f"https://download.pytorch.org/libtorch/{pt_build}/{libtorch_archive}" 
+ + +@t.final +class _PTArchiveMacOSX(_PTArchive): + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ( + (OperatingSystem.DARWIN, Architecture.ARM64), + (OperatingSystem.DARWIN, Architecture.X64), + ) + + @property + def url(self) -> str: + if self.device == "gpu": + raise BuildError("RedisAI does not currently support GPU on Mac OSX") + if self.architecture == Architecture.X64: + pt_build = "cpu" + libtorch_archive = f"libtorch-macos-{self.version}.zip" + root_url = "https://download.pytorch.org/libtorch" + return f"{root_url}/{pt_build}/{libtorch_archive}" + if self.architecture == Architecture.ARM64: + libtorch_archive = f"libtorch-macos-arm64-{self.version}.zip" + # pylint: disable-next=line-too-long + root_url = ( + "https://github.com/CrayLabs/ml_lib_builder/releases/download/v0.1/" + ) + return f"{root_url}/{libtorch_archive}" + + raise BuildError("Unsupported architecture for Pytorch: {self.architecture}") + + +def _choose_pt_variant( + os_: OperatingSystem, +) -> t.Union[t.Type[_PTArchiveLinux], t.Type[_PTArchiveMacOSX]]: + if os_ == OperatingSystem.DARWIN: + return _PTArchiveMacOSX + if os_ == OperatingSystem.LINUX: + return _PTArchiveLinux + + raise BuildError(f"Unsupported OS for PyTorch: {os_}") + + +@t.final +@dataclass(frozen=True) +class _TFArchive(_WebTGZ, _RAIBuildDependency): + os_: OperatingSystem + architecture: Architecture + device: TDeviceStr + version: str + + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ( + (OperatingSystem.LINUX, Architecture.X64), + (OperatingSystem.DARWIN, Architecture.X64), + ) + + @property + def url(self) -> str: + if self.architecture == Architecture.X64: + tf_arch = "x86_64" + else: + raise BuildError( + "Unexpected Architecture for TF Archive: {self.architecture}" + ) + + if self.os_ == OperatingSystem.LINUX: + tf_os = "linux" + tf_device = self.device + elif self.os_ == OperatingSystem.DARWIN: + 
tf_os = "darwin" + if self.device == "gpu": + raise BuildError("RedisAI does not currently support GPU on Macos") + tf_device = "cpu" + else: + raise BuildError("Unexpected OS for TF Archive: {self.os_}") + return ( + "https://storage.googleapis.com/tensorflow/libtensorflow/" + f"libtensorflow-{tf_device}-{tf_os}-{tf_arch}-{self.version}.tar.gz" + ) + + @property + def __rai_dependency_name__(self) -> str: + return f"libtensorflow@{self.url}" + + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + target = Path(target) / "libtensorflow" + target.mkdir() + self.extract(target) + return target + + +@t.final +@dataclass(frozen=True) +class _ORTArchive(_WebTGZ, _RAIBuildDependency): + os_: OperatingSystem + device: TDeviceStr + version: str + + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ( + (OperatingSystem.LINUX, Architecture.X64), + (OperatingSystem.DARWIN, Architecture.X64), + ) + + @property + def url(self) -> str: + ort_url_base = ( + "https://github.com/microsoft/onnxruntime/releases/" + f"download/v{self.version}" + ) + if self.os_ == OperatingSystem.LINUX: + ort_os = "linux" + ort_arch = "x64" + ort_build = "-gpu" if self.device == "gpu" else "" + elif self.os_ == OperatingSystem.DARWIN: + ort_os = "osx" + ort_arch = "x86_64" + ort_build = "" + if self.device == "gpu": + raise BuildError("RedisAI does not currently support GPU on Macos") + else: + raise BuildError("Unexpected OS for TF Archive: {self.os_}") + ort_archive = f"onnxruntime-{ort_os}-{ort_arch}{ort_build}-{self.version}.tgz" + return f"{ort_url_base}/{ort_archive}" + + @property + def __rai_dependency_name__(self) -> str: + return f"onnxruntime@{self.url}" + + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + target = Path(target).resolve() / "onnxruntime" + self.extract(target) + try: + (extracted_dir,) = target.iterdir() + except ValueError: + raise BuildError( + "Unexpected 
number of files extracted from ORT archive" + ) from None + for file in extracted_dir.iterdir(): + file.rename(target / file.name) + extracted_dir.rmdir() + return target + + +def _git(*args: str) -> None: + git = Builder.binary_path("git") + cmd = (git,) + args + with subprocess.Popen(cmd) as proc: + proc.wait() + if proc.returncode != 0: + raise BuildError( + f"Command `{' '.join(cmd)}` failed with exit code {proc.returncode}" + ) + + +def config_git_command(plat: Platform, cmd: t.Sequence[str]) -> t.List[str]: + """Modify git commands to include autocrlf when on a platform that needs + autocrlf enabled to behave correctly + """ + cmd = list(cmd) + where = next((i for i, tok in enumerate(cmd) if tok.endswith("git")), len(cmd)) + 2 + if where >= len(cmd): + raise ValueError(f"Failed to locate git command in '{' '.join(cmd)}'") + if plat == Platform(OperatingSystem.DARWIN, Architecture.ARM64): + cmd = ( + cmd[:where] + + ["--config", "core.autocrlf=false", "--config", "core.eol=lf"] + + cmd[where:] + ) + return cmd diff --git a/smartsim/_core/config/__init__.py b/smartsim/_core/config/__init__.py index 97e3caf18..1637d6a2f 100644 --- a/smartsim/_core/config/__init__.py +++ b/smartsim/_core/config/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index af5ebe508..42a548c42 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -225,12 +225,16 @@ def telemetry_frequency(self) -> int: @property def telemetry_enabled(self) -> bool: - return int(os.environ.get("SMARTSIM_FLAG_TELEMETRY", "0")) > 0 + return int(os.environ.get("SMARTSIM_FLAG_TELEMETRY", "1")) > 0 @property def telemetry_cooldown(self) -> int: return int(os.environ.get("SMARTSIM_TELEMETRY_COOLDOWN", 90)) + @property + def telemetry_subdir(self) -> str: + return ".smartsim/telemetry" + @lru_cache(maxsize=128, typed=False) def get_config() -> Config: diff --git a/smartsim/_core/control/__init__.py b/smartsim/_core/control/__init__.py index 2a89c04b5..0acd80650 100644 --- a/smartsim/_core/control/__init__.py +++ b/smartsim/_core/control/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index e3e463c51..3b673970a 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -63,13 +63,7 @@ from ...servertype import CLUSTERED, STANDALONE from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES from ..config import CONFIG -from ..launcher import ( - CobaltLauncher, - LocalLauncher, - LSFLauncher, - PBSLauncher, - SlurmLauncher, -) +from ..launcher import LocalLauncher, LSFLauncher, PBSLauncher, SlurmLauncher from ..launcher.launcher import Launcher from ..utils import check_cluster_status, create_cluster, serialize from .job import Job @@ -318,7 +312,7 @@ def get_entity_list_status( def init_launcher(self, launcher: str) -> None: """Initialize the controller with a specific type of launcher. - SmartSim currently supports slurm, pbs(pro), cobalt, lsf, + SmartSim currently supports slurm, pbs(pro), lsf, and local launching :param launcher: which launcher to initialize @@ -331,7 +325,6 @@ def init_launcher(self, launcher: str) -> None: "slurm": SlurmLauncher, "pbs": PBSLauncher, "pals": PBSLauncher, - "cobalt": CobaltLauncher, "lsf": LSFLauncher, "local": LocalLauncher, } @@ -631,7 +624,7 @@ def _prep_entity_client_env(self, entity: Model) -> None: # Set address to local if it's a colocated model if entity.colocated and entity.run_settings.colocated_db_settings is not None: db_name_colo = entity.run_settings.colocated_db_settings["db_identifier"] - + assert isinstance(db_name_colo, str) for key in address_dict: _, db_id = unpack_db_identifier(key, "_") if db_name_colo == db_id: @@ -842,11 +835,11 @@ def _start_telemetry_monitor(self, exp_dir: str) -> None: :param exp_dir: An experiment directory :type exp_dir: str """ - logger.debug("Starting telemetry monitor process") if ( self._telemetry_monitor is None or self._telemetry_monitor.returncode is not None ): + logger.debug("Starting telemetry monitor process") cmd = [ sys.executable, "-m", @@ -866,6 +859,7 @@ def _start_telemetry_monitor(self, exp_dir: str) -> None: 
cwd=str(pathlib.Path(__file__).parent.parent.parent), shell=False, ) + logger.debug("Telemetry monitor started") class _AnonymousBatchJob(EntityList[Model]): diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index aa4ecce76..f3bd8cf3a 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index d23543030..e482b9951 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +34,7 @@ from ...database import Orchestrator from ...entity import DBNode, EntitySequence, SmartSimEntity -from ...log import get_logger +from ...log import ContextThread, get_logger from ...status import STATUS_NEVER_STARTED, TERMINAL_STATUSES from ..config import CONFIG from ..launcher import Launcher, LocalLauncher @@ -80,7 +80,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None: def start(self) -> None: """Start a thread for the job manager""" - self.monitor = Thread(name="JobManager", daemon=True, target=self.run) + self.monitor = ContextThread(name="JobManager", daemon=True, target=self.run) self.monitor.start() def run(self) -> None: diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 62ab013e5..25037540c 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, 
Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import itertools import pathlib import typing as t from dataclasses import dataclass, field @@ -31,6 +32,7 @@ from ...database import Orchestrator from ...entity import DBNode, Ensemble, EntitySequence, Model, SmartSimEntity from ...error import SmartSimError +from ..config import CONFIG from ..utils import helpers as _helpers from ..utils import serialize as _serialize @@ -177,52 +179,12 @@ def __str__(self) -> str: @property def has_db_objects(self) -> bool: """Check if any entity has DBObjects to set""" - - def has_db_models( - entity: t.Union[EntitySequence[SmartSimEntity], Model] - ) -> bool: - return len(list(entity.db_models)) > 0 - - def has_db_scripts( - entity: t.Union[EntitySequence[SmartSimEntity], Model] - ) -> bool: - return len(list(entity.db_scripts)) > 0 - - has_db_objects = False - - # Check if any model has either a DBModel or a DBScript - # we update has_db_objects so that as soon as one check - # returns True, we can exit - has_db_objects |= any( - has_db_models(model) | has_db_scripts(model) for model in self.models + ents: t.Iterable[t.Union[Model, Ensemble]] = itertools.chain( + self.models, + self.ensembles, + (member for ens in self.ensembles for member in ens.entities), ) - if has_db_objects: - return True - - # If there are no ensembles, there can be no outstanding model - # to check for DBObjects, return current value of DBObjects, which - # should be False - ensembles = self.ensembles - if not ensembles: - return has_db_objects - - # First check if there is any ensemble DBObject, if so, return True - has_db_objects |= any( - has_db_models(ensemble) | has_db_scripts(ensemble) for ensemble in 
ensembles - ) - if has_db_objects: - return True - for ensemble in ensembles: - # Last case, check if any model within an ensemble has DBObjects attached - has_db_objects |= any( - has_db_models(model) | has_db_scripts(model) - for model in ensemble.models - ) - if has_db_objects: - return True - - # `has_db_objects` should be False here - return has_db_objects + return any(any(ent.db_models) or any(ent.db_scripts) for ent in ents) class _LaunchedManifestMetadata(t.NamedTuple): @@ -343,7 +305,7 @@ def finalize(self) -> LaunchedManifest[_T]: def _format_exp_telemetry_path( exp_path: t.Union[str, "os.PathLike[str]"] ) -> pathlib.Path: - return pathlib.Path(exp_path, _serialize.TELMON_SUBDIR) + return pathlib.Path(exp_path, CONFIG.telemetry_subdir) def _format_run_telemetry_path( diff --git a/smartsim/_core/entrypoints/__init__.py b/smartsim/_core/entrypoints/__init__.py index bf6fd954c..efe03908e 100644 --- a/smartsim/_core/entrypoints/__init__.py +++ b/smartsim/_core/entrypoints/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 332d6e019..600ae2ff3 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py index 18d27601f..f94ad6e61 100644 --- a/smartsim/_core/entrypoints/indirect.py +++ b/smartsim/_core/entrypoints/indirect.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index ef9911829..018fc26fd 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py index 86d6fe72f..115528bf4 100644 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -49,7 +49,6 @@ from smartsim._core.config import CONFIG from smartsim._core.control.job import JobEntity, _JobKey from smartsim._core.control.jobmanager import JobManager -from smartsim._core.launcher.cobalt.cobaltLauncher import CobaltLauncher from smartsim._core.launcher.launcher import Launcher from smartsim._core.launcher.local.local import LocalLauncher from smartsim._core.launcher.lsf.lsfLauncher import LSFLauncher @@ -57,7 +56,7 @@ from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher from smartsim._core.launcher.stepInfo import StepInfo from smartsim._core.utils.helpers import get_ts -from smartsim._core.utils.serialize import MANIFEST_FILENAME, TELMON_SUBDIR +from smartsim._core.utils.serialize import MANIFEST_FILENAME from smartsim.error.errors import SmartSimError from smartsim.status import STATUS_COMPLETED, TERMINAL_STATUSES @@ -325,14 +324,13 @@ def __init__( self._launcher_map: t.Dict[str, t.Type[Launcher]] = { "slurm": SlurmLauncher, "pbs": PBSLauncher, - "cobalt": CobaltLauncher, "lsf": LSFLauncher, "local": LocalLauncher, } def init_launcher(self, launcher: str) -> Launcher: """Initialize the controller with a specific type of launcher. 
- SmartSim currently supports slurm, pbs(pro), cobalt, lsf, + SmartSim currently supports slurm, pbs(pro), lsf, and local launching :param launcher: which launcher to initialize @@ -582,7 +580,7 @@ def main( poll for new jobs before attempting to shutdown :type cooldown_duration: int """ - manifest_relpath = pathlib.Path(TELMON_SUBDIR) / MANIFEST_FILENAME + manifest_relpath = pathlib.Path(CONFIG.telemetry_subdir) / MANIFEST_FILENAME manifest_path = experiment_dir / manifest_relpath monitor_pattern = str(manifest_relpath) @@ -667,7 +665,9 @@ def get_parser() -> argparse.ArgumentParser: log.setLevel(logging.DEBUG) log.propagate = False - log_path = os.path.join(args.exp_dir, TELMON_SUBDIR, "telemetrymonitor.log") + log_path = os.path.join( + args.exp_dir, CONFIG.telemetry_subdir, "telemetrymonitor.log" + ) fh = logging.FileHandler(log_path, "a") log.addHandler(fh) diff --git a/smartsim/_core/generation/__init__.py b/smartsim/_core/generation/__init__.py index 10470e2d5..5224f8498 100644 --- a/smartsim/_core/generation/__init__.py +++ b/smartsim/_core/generation/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index 79cea06b7..502753df7 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/generation/modelwriter.py b/smartsim/_core/generation/modelwriter.py index 0cf071082..3062ea1db 100644 --- a/smartsim/_core/generation/modelwriter.py +++ b/smartsim/_core/generation/modelwriter.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/__init__.py b/smartsim/_core/launcher/__init__.py index 6e1aa724e..0c4001cd4 100644 --- a/smartsim/_core/launcher/__init__.py +++ b/smartsim/_core/launcher/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from .cobalt.cobaltLauncher import CobaltLauncher from .launcher import Launcher from .local.local import LocalLauncher from .lsf.lsfLauncher import LSFLauncher @@ -33,7 +32,6 @@ __all__ = [ "Launcher", - "CobaltLauncher", "LocalLauncher", "LSFLauncher", "PBSLauncher", diff --git a/smartsim/_core/launcher/cobalt/__init__.py b/smartsim/_core/launcher/cobalt/__init__.py deleted file mode 100644 index bf6fd954c..000000000 --- a/smartsim/_core/launcher/cobalt/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. 
Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/launcher/cobalt/cobaltCommands.py b/smartsim/_core/launcher/cobalt/cobaltCommands.py deleted file mode 100644 index bf6fd954c..000000000 --- a/smartsim/_core/launcher/cobalt/cobaltCommands.py +++ /dev/null @@ -1,25 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/launcher/cobalt/cobaltLauncher.py b/smartsim/_core/launcher/cobalt/cobaltLauncher.py deleted file mode 100644 index 56ebe12cc..000000000 --- a/smartsim/_core/launcher/cobalt/cobaltLauncher.py +++ /dev/null @@ -1,207 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import time -import typing as t - -import psutil - -from smartsim._core.launcher.step import Step -from smartsim.settings import ( - AprunSettings, - CobaltBatchSettings, - MpiexecSettings, - MpirunSettings, - OrterunSettings, - RunSettings, - SettingsBase, -) - -from ....error import LauncherError -from ....log import get_logger -from ....status import STATUS_CANCELLED, STATUS_COMPLETED -from ...config import CONFIG -from ..launcher import WLMLauncher -from ..pbs.pbsCommands import qdel, qstat -from ..step import ( - AprunStep, - CobaltBatchStep, - LocalStep, - MpiexecStep, - MpirunStep, - OrterunStep, - Step, -) -from ..stepInfo import CobaltStepInfo, StepInfo -from .cobaltParser import parse_cobalt_step_id, parse_cobalt_step_status, parse_qsub_out - -logger = get_logger(__name__) - - -class CobaltLauncher(WLMLauncher): - """This class encapsulates the functionality needed - to launch jobs on systems that use Cobalt as a workload manager. - - All WLM launchers are capable of launching managed and unmanaged - jobs. Managed jobs are queried through interaction with with WLM, - in this case Cobalt. Unmanaged jobs are held in the TaskManager - and are managed through references to their launching process ID - i.e. 
a psutil.Popen object - """ - - def __init__(self) -> None: - super().__init__() - self.user = psutil.Process().username() - - @property - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: - # RunSettings types supported by this launcher - return { - AprunSettings: AprunStep, - CobaltBatchSettings: CobaltBatchStep, - MpirunSettings: MpirunStep, - MpiexecSettings: MpiexecStep, - OrterunSettings: OrterunStep, - RunSettings: LocalStep, - } - - def run(self, step: Step) -> t.Optional[str]: - """Run a job step through Cobalt - - :param step: a job step instance - :type step: Step - :raises LauncherError: if launch fails - :return: job step id if job is managed - :rtype: str - """ - if not self.task_manager.actively_monitoring: - self.task_manager.start() - - cmd_list = step.get_launch_cmd() - step_id = None - task_id = None - if isinstance(step, CobaltBatchStep): - # wait for batch step to submit successfully - return_code, out, err = self.task_manager.start_and_wait(cmd_list, step.cwd) - if return_code != 0: - raise LauncherError( - f"Cobalt qsub batch submission failed\n {out}\n {err}" - ) - if out: - step_id = parse_qsub_out(out) - logger.debug(f"Gleaned batch job id: {step_id} for {step.name}") - else: - # aprun doesn't direct output for us. 
- out, err = step.get_output_files() - - # pylint: disable-next=consider-using-with - output = open(out, "w+", encoding="utf-8") - # pylint: disable-next=consider-using-with - error = open(err, "w+", encoding="utf-8") - - task_id = self.task_manager.start_task( - cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() - ) - - # if batch submission did not successfully retrieve job ID - if not step_id and step.managed: - step_id = self._get_cobalt_step_id(step) - - self.step_mapping.add(step.name, step_id, task_id, step.managed) - return step_id - - def stop(self, step_name: str) -> StepInfo: - """Step a job step - - :param step_name: name of the job to stop - :type step_name: str - :return: update for job due to cancel - :rtype: StepInfo - """ - stepmap = self.step_mapping[step_name] - if stepmap.managed: - qdel_rc, _, err = qdel([str(stepmap.step_id)]) - if qdel_rc != 0: - logger.warning(f"Unable to cancel job step {step_name}\n {err}") - if stepmap.task_id: - self.task_manager.remove_task(str(stepmap.task_id)) - else: - if stepmap.task_id: - self.task_manager.remove_task(str(stepmap.task_id)) - - _, step_info = self.get_step_update([step_name])[0] - if not step_info: - raise LauncherError(f"Could not get step_info for job step {step_name}") - step_info.status = STATUS_CANCELLED # set status to cancelled instead of failed - return step_info - - def _get_cobalt_step_id(self, step: Step, interval: int = 2) -> str: - """Get the step_id of a step from qstat (rarely used) - - Parses cobalt qstat output by looking for the step name - """ - step_id = None - trials = CONFIG.wlm_trials - while trials > 0: - output, _ = qstat(["--header", "JobName:JobId", "-u", self.user]) - step_id = parse_cobalt_step_id(output, step.name) - if step_id: - break - else: - time.sleep(interval) - trials -= 1 - if not step_id: - raise LauncherError("Could not find id of launched job step") - return step_id - - def _get_managed_step_update(self, step_ids: t.List[str]) -> 
t.List[StepInfo]: - """Get step updates for WLM managed jobs - - :param step_ids: list of job step ids - :type step_ids: list[str] - :return: list of updates for managed jobs - :rtype: list[StepInfo] - """ - args = ["--header", "JobId:State", "-u", self.user] - args.extend(step_ids) - qstat_out, _ = qstat(args) - - stats = [ - parse_cobalt_step_status(qstat_out, str(step_id)) for step_id in step_ids - ] - # create CobaltStepInfo objects to return - updates: t.List[StepInfo] = [] - for stat, _ in zip(stats, step_ids): - info = CobaltStepInfo(stat, None) # returncode not logged by Cobalt - - if info.status == STATUS_COMPLETED: - info.returncode = 0 - - updates.append(info) - return updates - - def __str__(self) -> str: - return "Cobalt" diff --git a/smartsim/_core/launcher/cobalt/cobaltParser.py b/smartsim/_core/launcher/cobalt/cobaltParser.py deleted file mode 100644 index c76509d36..000000000 --- a/smartsim/_core/launcher/cobalt/cobaltParser.py +++ /dev/null @@ -1,86 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -def parse_cobalt_step_status(output: str, step_id: str) -> str: - """ - Parse and return the status of a cobalt step - - :param output: output qstat - :type output: str - :param step_id: the id of the step to query - :type step_id: str - :rtype: str - """ - status = "NOTFOUND" - for line in output.split("\n"): - fields = line.split() - if len(fields) >= 2: - if fields[0] == step_id: - status = fields[1] - break - return status - - -def parse_cobalt_step_id(output: str, step_name: str) -> str: - """Parse and return the step id from a cobalt qstat command - - :param output: output qstat - :type output: str - :param step_name: the name of the step to query - :type step_name: str - :return: the step_id - :rtype: str - """ - step_id = "" - for line in output.split("\n"): - fields = line.split() - if len(fields) >= 2: - if fields[0] == step_name: - step_id = fields[1] - break - return step_id - - -def parse_qsub_out(output: str) -> str: - """ - Parse and return the step id from a cobalt qsub command - - :param output: output qstat - :type output: str - :return: the step_id - :rtype: str - """ - step_id = "" - for line in output.split("\n"): - try: - value = line.strip() - int(value) # if the cast works, return original string - step_id = value - break - except ValueError: - continue - return step_id diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index ea331023c..11d26b141 100644 --- 
a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index 61f0460f9..80000c22f 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/local/__init__.py b/smartsim/_core/launcher/local/__init__.py index bf6fd954c..efe03908e 100644 --- a/smartsim/_core/launcher/local/__init__.py +++ b/smartsim/_core/launcher/local/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index fee058d16..96778ec0d 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/lsf/__init__.py b/smartsim/_core/launcher/lsf/__init__.py index bf6fd954c..efe03908e 100644 --- a/smartsim/_core/launcher/lsf/__init__.py +++ b/smartsim/_core/launcher/lsf/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/lsf/lsfCommands.py b/smartsim/_core/launcher/lsf/lsfCommands.py index 99836fa7a..d6d0ee031 100644 --- a/smartsim/_core/launcher/lsf/lsfCommands.py +++ b/smartsim/_core/launcher/lsf/lsfCommands.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/lsf/lsfLauncher.py b/smartsim/_core/launcher/lsf/lsfLauncher.py index c2f432807..a8b6fafdb 100644 --- a/smartsim/_core/launcher/lsf/lsfLauncher.py +++ b/smartsim/_core/launcher/lsf/lsfLauncher.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/lsf/lsfParser.py b/smartsim/_core/launcher/lsf/lsfParser.py index fff49c57e..33837d2bd 100644 --- a/smartsim/_core/launcher/lsf/lsfParser.py +++ b/smartsim/_core/launcher/lsf/lsfParser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/pbs/__init__.py b/smartsim/_core/launcher/pbs/__init__.py index bf6fd954c..efe03908e 100644 --- a/smartsim/_core/launcher/pbs/__init__.py +++ b/smartsim/_core/launcher/pbs/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/pbs/pbsCommands.py b/smartsim/_core/launcher/pbs/pbsCommands.py index 0fdd06f7b..f738ef1f8 100644 --- a/smartsim/_core/launcher/pbs/pbsCommands.py +++ b/smartsim/_core/launcher/pbs/pbsCommands.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index 1b77ffd81..0b2f85e95 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/pbs/pbsParser.py b/smartsim/_core/launcher/pbs/pbsParser.py index 426166342..362577595 100644 --- a/smartsim/_core/launcher/pbs/pbsParser.py +++ b/smartsim/_core/launcher/pbs/pbsParser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/slurm/__init__.py b/smartsim/_core/launcher/slurm/__init__.py index bf6fd954c..efe03908e 100644 --- a/smartsim/_core/launcher/slurm/__init__.py +++ b/smartsim/_core/launcher/slurm/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/slurm/slurmCommands.py b/smartsim/_core/launcher/slurm/slurmCommands.py index ecf545b91..2e37f1d79 100644 --- a/smartsim/_core/launcher/slurm/slurmCommands.py +++ b/smartsim/_core/launcher/slurm/slurmCommands.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index cba8df4f1..e939a63db 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/slurm/slurmParser.py b/smartsim/_core/launcher/slurm/slurmParser.py index bfea06efb..ede687eb6 100644 --- a/smartsim/_core/launcher/slurm/slurmParser.py +++ b/smartsim/_core/launcher/slurm/slurmParser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/__init__.py b/smartsim/_core/launcher/step/__init__.py index 98dd1a921..663edb682 100644 --- a/smartsim/_core/launcher/step/__init__.py +++ b/smartsim/_core/launcher/step/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from .alpsStep import AprunStep -from .cobaltStep import CobaltBatchStep from .localStep import LocalStep from .lsfStep import BsubBatchStep, JsrunStep from .mpiStep import MpiexecStep, MpirunStep, OrterunStep diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index d675f703f..61ca5eee8 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -113,12 +113,6 @@ def _set_alloc(self) -> None: logger.debug( f"Running on PBS allocation {self.alloc} gleaned from user environment" ) - elif "COBALT_JOBID" in os.environ: - self.alloc = os.environ["COBALT_JOBID"] - logger.debug( - f"Running on Cobalt allocation {self.alloc} gleaned " - "from user environment" - ) else: raise AllocationError( "No allocation specified or found and not running in batch" diff --git a/smartsim/_core/launcher/step/cobaltStep.py b/smartsim/_core/launcher/step/cobaltStep.py deleted file mode 100644 index b224121e2..000000000 --- a/smartsim/_core/launcher/step/cobaltStep.py +++ /dev/null @@ -1,106 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -import stat -import typing as t - -from ....log import get_logger -from ....settings import CobaltBatchSettings -from .step import Step - -logger = get_logger(__name__) - - -class CobaltBatchStep(Step): - def __init__( - self, name: str, cwd: str, batch_settings: CobaltBatchSettings - ) -> None: - """Initialize a Cobalt qsub step - - :param name: name of the entity to launch - :type name: str - :param cwd: path to launch dir - :type cwd: str - :param batch_settings: batch settings for entity - :type batch_settings: CobaltBatchSettings - """ - super().__init__(name, cwd, batch_settings) - self.step_cmds: t.List[t.List[str]] = [] - self.managed = True - self.batch_settings = batch_settings - - def get_launch_cmd(self) -> t.List[str]: - """Get the launch command for the batch - - :return: launch command for the batch - :rtype: list[str] - """ - script = self._write_script() - return [self.batch_settings.batch_cmd, script] - - def add_to_batch(self, step: Step) -> None: - """Add a job step to this batch - - :param step: a job step instance e.g. 
SrunStep - :type step: Step - """ - launch_cmd = step.get_launch_cmd() - self.step_cmds.append(launch_cmd) - logger.debug(f"Added step command to batch for {step.name}") - - def _write_script(self) -> str: - """Write the batch script - - :return: batch script path after writing - :rtype: str - """ - batch_script = self.get_step_file(ending=".sh") - cobalt_debug = self.get_step_file(ending=".cobalt-debug") - output, error = self.get_output_files() - with open(batch_script, "w", encoding="utf-8") as script_file: - script_file.write("#!/bin/bash\n") - script_file.write(f"#COBALT -o {output}\n") - script_file.write(f"#COBALT -e {error}\n") - script_file.write(f"#COBALT --cwd {self.cwd}\n") - script_file.write(f"#COBALT --jobname {self.name}\n") - script_file.write(f"#COBALT --debuglog {cobalt_debug}\n") - - # add additional sbatch options - for opt in self.batch_settings.format_batch_args(): - script_file.write(f"#COBALT {opt}\n") - - for cmd in self.batch_settings.preamble: - script_file.write(f"{cmd}\n") - - for i, step_cmd in enumerate(self.step_cmds): - script_file.write("\n") - script_file.write(f"{' '.join((step_cmd))} &\n") - if i == len(self.step_cmds) - 1: - script_file.write("\n") - script_file.write("wait\n") - os.chmod(batch_script, stat.S_IXUSR | stat.S_IWUSR | stat.S_IRUSR) - return batch_script diff --git a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index 2f10bc79d..968152a41 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index 953ab9c45..1c88dadb8 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 7971fb732..785d55e92 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -57,7 +57,7 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings) -> None: self._set_alloc() self.run_settings = run_settings - _supported_launchers = ["PBS", "COBALT", "SLURM", "LSB"] + _supported_launchers = ["PBS", "SLURM", "LSB"] @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: diff --git a/smartsim/_core/launcher/step/pbsStep.py b/smartsim/_core/launcher/step/pbsStep.py index 9218894f9..65dac3225 100644 --- a/smartsim/_core/launcher/step/pbsStep.py +++ b/smartsim/_core/launcher/step/pbsStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index cb0db483b..7baab891b 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index ebbdd074e..ddb95a850 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -93,7 +93,7 @@ def get_colocated_launch_script(self) -> str: ) makedirs(osp.dirname(script_path), exist_ok=True) - db_settings: t.Dict[str, str] = {} + db_settings = {} if isinstance(self.step_settings, RunSettings): db_settings = self.step_settings.colocated_db_settings or {} diff --git a/smartsim/_core/launcher/stepInfo.py b/smartsim/_core/launcher/stepInfo.py index b33dac5ec..56b5218fc 100644 --- a/smartsim/_core/launcher/stepInfo.py +++ b/smartsim/_core/launcher/stepInfo.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -195,42 +195,6 @@ def __init__( ) -class CobaltStepInfo(StepInfo): # cov-cobalt - @property - def mapping(self) -> t.Dict[str, str]: - return { - "running": STATUS_RUNNING, - "queued": STATUS_PAUSED, - "starting": STATUS_PAUSED, - "dep_hold": STATUS_PAUSED, - "user_hold": STATUS_PAUSED, - "admin_hold": STATUS_PAUSED, - "dep_fail": STATUS_FAILED, # unsure of this one - "terminating": STATUS_COMPLETED, - "killing": STATUS_COMPLETED, - "exiting": STATUS_COMPLETED, - } - - def __init__( - self, - status: str = "", - returncode: t.Optional[int] = None, - output: t.Optional[str] = None, - error: t.Optional[str] = None, - ) -> None: - if status == "NOTFOUND": - # returncode not logged by Cobalt - # if job has exited the queue then we consider it "completed" - # this should only be hit in the case when job exits abnormally fast - smartsim_status = "Completed" - returncode = 0 - else: - smartsim_status = self._get_smartsim_status(status) - super().__init__( - smartsim_status, status, returncode, output=output, error=error - ) - - class LSFBatchStepInfo(StepInfo): # cov-lsf @property def mapping(self) -> t.Dict[str, str]: diff --git a/smartsim/_core/launcher/stepMapping.py b/smartsim/_core/launcher/stepMapping.py index 665404b1b..15c93470f 100644 --- a/smartsim/_core/launcher/stepMapping.py +++ b/smartsim/_core/launcher/stepMapping.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/taskManager.py b/smartsim/_core/launcher/taskManager.py index 2ad84493f..84123944e 100644 --- a/smartsim/_core/launcher/taskManager.py +++ b/smartsim/_core/launcher/taskManager.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,12 +29,12 @@ import time import typing as t from subprocess import PIPE -from threading import RLock, Thread +from threading import RLock import psutil from ...error import LauncherError -from ...log import get_logger +from ...log import ContextThread, get_logger from ..utils.helpers import check_dev_log_level from .util.shell import execute_async_cmd, execute_cmd @@ -74,7 +74,7 @@ def start(self) -> None: The TaskManager is run as a daemon thread meaning that it will die when the main thread dies. """ - monitor = Thread(name="TaskManager", daemon=True, target=self.run) + monitor = ContextThread(name="TaskManager", daemon=True, target=self.run) monitor.start() def run(self) -> None: diff --git a/smartsim/_core/launcher/util/__init__.py b/smartsim/_core/launcher/util/__init__.py index bf6fd954c..efe03908e 100644 --- a/smartsim/_core/launcher/util/__init__.py +++ b/smartsim/_core/launcher/util/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/util/launcherUtil.py b/smartsim/_core/launcher/util/launcherUtil.py index 9fcd973e3..a24d69e49 100644 --- a/smartsim/_core/launcher/util/launcherUtil.py +++ b/smartsim/_core/launcher/util/launcherUtil.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/util/shell.py b/smartsim/_core/launcher/util/shell.py index 1fc243c15..c747bacbc 100644 --- a/smartsim/_core/launcher/util/shell.py +++ b/smartsim/_core/launcher/util/shell.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py index e1123321b..cb9395881 100644 --- a/smartsim/_core/utils/__init__.py +++ b/smartsim/_core/utils/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index 8d7edf722..27059e320 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/utils/network.py b/smartsim/_core/utils/network.py index f18be208e..69eeb3e1b 100644 --- a/smartsim/_core/utils/network.py +++ b/smartsim/_core/utils/network.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 6c592d0f3..3bcf1c1f2 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -177,6 +177,8 @@ def set_ml_model(db_model: DBModel, client: Client) -> None: outputs=db_model.outputs, ) else: + if db_model.model is None: + raise ValueError(f"No model attacted to {db_model.name}") client.set_model( name=db_model.name, model=db_model.model, @@ -203,7 +205,7 @@ def set_script(db_script: DBScript, client: Client) -> None: client.set_script_from_file( name=db_script.name, file=str(db_script.file), device=device ) - else: + elif db_script.script: if isinstance(db_script.script, str): client.set_script( name=db_script.name, script=db_script.script, device=device @@ -212,7 +214,8 @@ def set_script(db_script: DBScript, client: Client) -> None: client.set_function( name=db_script.name, function=db_script.script, device=device ) - + else: + raise ValueError(f"No script or file attached to {db_script.name}") except RedisReplyError as error: # pragma: no cover logger.error("Error while setting model on orchestrator.") raise error diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py index 75f9aef66..69840b838 
100644 --- a/smartsim/_core/utils/serialize.py +++ b/smartsim/_core/utils/serialize.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -47,7 +47,7 @@ TStepLaunchMetaData = t.Tuple[ t.Optional[str], t.Optional[str], t.Optional[bool], str, str, Path ] -TELMON_SUBDIR: t.Final[str] = ".smartsim/telemetry" + MANIFEST_FILENAME: t.Final[str] = "manifest.json" _LOGGER = smartsim.log.get_logger(__name__) @@ -58,6 +58,7 @@ def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: return manifest.metadata.run_telemetry_subdirectory.mkdir(parents=True, exist_ok=True) + exp_out, exp_err = smartsim.log.get_exp_log_paths() new_run = { "run_id": manifest.metadata.run_id, @@ -81,12 +82,14 @@ def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: manifest_dict = { "schema info": { "schema_name": "entity manifest", - "version": "0.0.2", + "version": "0.0.3", }, "experiment": { "name": manifest.metadata.exp_name, "path": manifest.metadata.exp_path, "launcher": manifest.metadata.launcher_name, + "out_file": str(exp_out), + "err_file": str(exp_err), }, "runs": [new_run], } diff --git a/smartsim/database/__init__.py b/smartsim/database/__init__.py index f16cf7703..106f8e1e2 100644 --- a/smartsim/database/__init__.py +++ b/smartsim/database/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 07a1a1bfd..431cb43c5 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,7 +44,6 @@ from ..settings import ( AprunSettings, BsubBatchSettings, - CobaltBatchSettings, JsrunSettings, MpiexecSettings, MpirunSettings, @@ -64,7 +63,6 @@ "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], - "cobalt": ["aprun", "mpirun", "mpiexec"], "lsf": ["jsrun"], "local": [""], } @@ -389,7 +387,7 @@ def set_cpus(self, num_cpus: int) -> None: :type num_cpus: int """ if self.batch: - if self.launcher in ["pbs", "cobalt"]: + if self.launcher == "pbs": if hasattr(self, "batch_settings") and self.batch_settings: if hasattr(self.batch_settings, "set_ncpus"): self.batch_settings.set_ncpus(num_cpus) @@ -575,7 +573,7 @@ def set_max_message_size(self, size: int = 1_073_741_824) -> None: """ self.set_db_conf("proto-max-bulk-len", str(size)) - def set_db_conf(self, key: str, value: t.Union[int, str]) -> None: + def set_db_conf(self, key: str, value: str) -> None: """Set any valid configuration at runtime without the need to restart the database. 
All configuration parameters that are set are immediately loaded by the database and @@ -938,17 +936,6 @@ def _fill_reserved(self) -> None: "chdir", "D", ] - self._reserved_batch_args[CobaltBatchSettings] = [ - "cwd", - "error", - "e", - "output", - "o", - "outputprefix", - "N", - "l", - "jobname", - ] self._reserved_batch_args[QsubBatchSettings] = ["e", "o", "N", "l"] self._reserved_run_args[JsrunSettings] = [ "chdir", diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index 4ec28f2d4..4566cd76f 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 403984d16..9b67687f0 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index bebedb12c..0a495f066 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -33,7 +33,10 @@ __all__ = ["DBObject", "DBModel", "DBScript"] -class DBObject: +_DBObjectFuncT = t.TypeVar("_DBObjectFuncT", str, bytes) + + +class DBObject(t.Generic[_DBObjectFuncT]): """Base class for ML objects residing on DB. Should not be instantiated. 
""" @@ -41,14 +44,14 @@ class DBObject: def __init__( self, name: str, - func: t.Optional[str], + func: t.Optional[_DBObjectFuncT], file_path: t.Optional[str], device: t.Literal["CPU", "GPU"], devices_per_node: int, first_device: int, ) -> None: self.name = name - self.func = func + self.func: t.Optional[_DBObjectFuncT] = func self.file: t.Optional[Path] = ( None # Need to have this explicitly to check on it ) @@ -65,9 +68,7 @@ def devices(self) -> t.List[str]: @property def is_file(self) -> bool: - if self.func: - return False - return True + return not self.func @staticmethod def _check_tensor_args( @@ -153,7 +154,7 @@ def _check_devices( raise ValueError(msg) -class DBScript(DBObject): +class DBScript(DBObject[str]): def __init__( self, name: str, @@ -214,12 +215,12 @@ def __str__(self) -> str: return desc_str -class DBModel(DBObject): +class DBModel(DBObject[bytes]): def __init__( self, name: str, backend: str, - model: t.Optional[str] = None, + model: t.Optional[bytes] = None, model_file: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, @@ -276,7 +277,7 @@ def __init__( self.inputs, self.outputs = self._check_tensor_args(inputs, outputs) @property - def model(self) -> t.Union[str, None]: + def model(self) -> t.Optional[bytes]: return self.func def __str__(self) -> str: diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index 28ada31de..b30f82542 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -357,7 +357,7 @@ def add_ml_model( self, name: str, backend: str, - model: t.Optional[str] = None, + model: t.Optional[bytes] = None, model_path: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, @@ -422,6 +422,18 @@ def add_ml_model( inputs=inputs, outputs=outputs, ) + dupe = next( + ( + db_model.name + for ensemble_ml_model in self._db_models + if ensemble_ml_model.name == db_model.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'An ML Model with name "{db_model.name}" already exists' + ) self._db_models.append(db_model) for entity in self.models: self._extend_entity_db_models(entity, [db_model]) @@ -471,6 +483,18 @@ def add_script( devices_per_node=devices_per_node, first_device=first_device, ) + dupe = next( + ( + db_script.name + for ensemble_script in self._db_scripts + if ensemble_script.name == db_script.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'A Script with name "{db_script.name}" already exists' + ) self._db_scripts.append(db_script) for entity in self.models: self._extend_entity_db_scripts(entity, [db_script]) @@ -517,21 +541,78 @@ def add_function( devices_per_node=devices_per_node, first_device=first_device, ) + dupe = next( + ( + db_script.name + for ensemble_script in self._db_scripts + if ensemble_script.name == db_script.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'A Script with name "{db_script.name}" already exists' + ) self._db_scripts.append(db_script) for entity in self.models: self._extend_entity_db_scripts(entity, [db_script]) @staticmethod def _extend_entity_db_models(model: Model, db_models: t.List[DBModel]) -> None: - entity_db_models = [db_model.name for db_model in model.db_models] + """ + Ensures that the Machine Learning model names being added to the Ensemble + are unique. 
- for db_model in db_models: - if db_model.name not in entity_db_models: - model.add_ml_model_object(db_model) + This static method checks if the provided ML model names already exist in + the Ensemble. An SSUnsupportedError is raised if any duplicate names are + found. Otherwise, it appends the given list of DBModels to the Ensemble. + + :param model: SmartSim Model object. + :type model: Model + :param db_models: List of DBModels to append to the Ensemble. + :type db_models: t.List[DBModel] + """ + for add_ml_model in db_models: + dupe = next( + ( + db_model.name + for db_model in model.db_models + if db_model.name == add_ml_model.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'An ML Model with name "{add_ml_model.name}" already exists' + ) + model.add_ml_model_object(add_ml_model) @staticmethod def _extend_entity_db_scripts(model: Model, db_scripts: t.List[DBScript]) -> None: - entity_db_scripts = [db_script.name for db_script in model.db_scripts] - for db_script in db_scripts: - if not db_script.name in entity_db_scripts: - model.add_script_object(db_script) + """ + Ensures that the script/function names being added to the Ensemble are unique. + + This static method checks if the provided script/function names already exist + in the Ensemble. An SSUnsupportedError is raised if any duplicate names + are found. Otherwise, it appends the given list of DBScripts to the + Ensemble. + + :param model: SmartSim Model object. + :type model: Model + :param db_scripts: List of DBScripts to append to the Ensemble. 
+ :type db_scripts: t.List[DBScript] + """ + for add_script in db_scripts: + dupe = next( + ( + add_script.name + for db_script in model.db_scripts + if db_script.name == add_script.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'A Script with name "{add_script.name}" already exists' + ) + model.add_script_object(add_script) diff --git a/smartsim/entity/entity.py b/smartsim/entity/entity.py index 0d126c907..46202ca6a 100644 --- a/smartsim/entity/entity.py +++ b/smartsim/entity/entity.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/entityList.py b/smartsim/entity/entityList.py index 4eaf3faa0..6d958bda6 100644 --- a/smartsim/entity/entityList.py +++ b/smartsim/entity/entityList.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/files.py b/smartsim/entity/files.py index 3aae9402b..9c282b94e 100644 --- a/smartsim/entity/files.py +++ b/smartsim/entity/files.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 6b97cbf2e..c7b8731c2 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -258,11 +258,11 @@ def colocate_db_uds( f"Invalid name for unix socket: {unix_socket}. Must only " "contain alphanumeric characters or . : _ - /" ) - - uds_options = { + uds_options: t.Dict[str, t.Union[int, str]] = { "unix_socket": unix_socket, "socket_permissions": socket_permissions, - "port": 0, # This is hardcoded to 0 as recommended by redis for UDS + # This is hardcoded to 0 as recommended by redis for UDS + "port": 0, } common_options = { @@ -332,9 +332,18 @@ def colocate_db_tcp( def _set_colocated_db_settings( self, - connection_options: t.Dict[str, t.Any], - common_options: t.Dict[str, t.Any], - **kwargs: t.Any, + connection_options: t.Mapping[str, t.Union[int, t.List[str], str]], + common_options: t.Dict[ + str, + t.Union[ + t.Union[t.Iterable[t.Union[int, t.Iterable[int]]], None], + bool, + int, + str, + None, + ], + ], + **kwargs: t.Union[int, None], ) -> None: """ Ingest the connection-specific options (UDS/TCP) and set the final settings @@ -357,21 +366,42 @@ def _set_colocated_db_settings( ) # TODO list which db settings can be extras + custom_pinning_ = t.cast( + t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], + common_options.get("custom_pinning"), + ) + cpus_ = t.cast(int, common_options.get("cpus")) common_options["custom_pinning"] = self._create_pinning_string( - common_options["custom_pinning"], common_options["cpus"] + custom_pinning_, cpus_ ) - colo_db_config = {} + colo_db_config: t.Dict[ + str, + t.Union[ + bool, + int, + str, + None, + t.List[str], + t.Iterable[t.Union[int, t.Iterable[int]]], + t.List[DBModel], + t.List[DBScript], + t.Dict[str, t.Union[int, None]], + t.Dict[str, str], + ], + ] = {} colo_db_config.update(connection_options) colo_db_config.update(common_options) - # redisai arguments for inference settings - colo_db_config["rai_args"] = { + + redis_ai_temp = { "threads_per_queue": kwargs.get("threads_per_queue", None), "inter_op_parallelism": 
kwargs.get("inter_op_parallelism", None), "intra_op_parallelism": kwargs.get("intra_op_parallelism", None), } + # redisai arguments for inference settings + colo_db_config["rai_args"] = redis_ai_temp colo_db_config["extra_db_args"] = { - k: str(v) for k, v in kwargs.items() if k not in colo_db_config["rai_args"] + k: str(v) for k, v in kwargs.items() if k not in redis_ai_temp } self._check_db_objects_colo() @@ -455,7 +485,7 @@ def add_ml_model( self, name: str, backend: str, - model: t.Optional[str] = None, + model: t.Optional[bytes] = None, model_path: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, diff --git a/smartsim/entity/strategies.py b/smartsim/entity/strategies.py index e9db30c8f..2af88b58e 100644 --- a/smartsim/entity/strategies.py +++ b/smartsim/entity/strategies.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/error/__init__.py b/smartsim/error/__init__.py index a04f5d91e..4268905e6 100644 --- a/smartsim/error/__init__.py +++ b/smartsim/error/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index d71ae3f71..9a6954907 100644 --- a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 9de33419a..9fcc7b13e 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,18 +31,30 @@ from tabulate import tabulate +from smartsim.error.errors import SSUnsupportedError + from ._core import Controller, Generator, Manifest from ._core.utils import init_default from .database import Orchestrator from .entity import Ensemble, Model, SmartSimEntity from .error import SmartSimError -from .log import get_logger +from .log import ctx_exp_path, get_logger, method_contextualizer from .settings import Container, base, settings from .wlm import detect_launcher logger = get_logger(__name__) +def _exp_path_map(exp: "Experiment") -> str: + """Mapping function for use by method contextualizer to place the path of + the currently-executing experiment into context for log enrichment""" + return exp.exp_path + + +_contextualize = method_contextualizer(ctx_exp_path, _exp_path_map) + + +# pylint: disable=no-self-use class Experiment: """Experiments are the Python user interface for SmartSim. @@ -110,7 +122,7 @@ def __init__( :param exp_path: path to location of ``Experiment`` directory if generated :type exp_path: str, optional :param launcher: type of launcher being used, options are "slurm", "pbs", - "cobalt", "lsf", or "local". If set to "auto", + "lsf", or "local". If set to "auto", an attempt will be made to find an available launcher on the system. 
Defaults to "local" @@ -123,15 +135,18 @@ def __init__( if not osp.isdir(osp.abspath(exp_path)): raise NotADirectoryError("Experiment path provided does not exist") exp_path = osp.abspath(exp_path) - self.exp_path = init_default(osp.join(getcwd(), name), exp_path, str) + self.exp_path: str = init_default(osp.join(getcwd(), name), exp_path, str) if launcher == "auto": launcher = detect_launcher() + if launcher == "cobalt": + raise SSUnsupportedError("Cobalt launcher is no longer supported.") self._control = Controller(launcher=launcher) self._launcher = launcher.lower() self.db_identifiers: t.Set[str] = set() + @_contextualize def start( self, *args: t.Any, @@ -205,6 +220,7 @@ def start( logger.error(e) raise + @_contextualize def stop(self, *args: t.Any) -> None: """Stop specific instances launched by this ``Experiment`` @@ -241,6 +257,7 @@ def stop(self, *args: t.Any) -> None: logger.error(e) raise + @_contextualize def generate( self, *args: t.Any, @@ -278,6 +295,7 @@ def generate( logger.error(e) raise + @_contextualize def poll( self, interval: int = 10, verbose: bool = True, kill_on_interrupt: bool = True ) -> None: @@ -321,6 +339,7 @@ def poll( logger.error(e) raise + @_contextualize def finished(self, entity: SmartSimEntity) -> bool: """Query if a job has completed. 
@@ -344,6 +363,7 @@ def finished(self, entity: SmartSimEntity) -> bool: logger.error(e) raise + @_contextualize def get_status(self, *args: t.Any) -> t.List[str]: """Query the status of launched instances @@ -382,8 +402,9 @@ def get_status(self, *args: t.Any) -> t.List[str]: logger.error(e) raise - @staticmethod + @_contextualize def create_ensemble( + self, name: str, params: t.Optional[t.Dict[str, t.Any]] = None, batch_settings: t.Optional[base.BatchSettings] = None, @@ -456,8 +477,9 @@ def create_ensemble( logger.error(e) raise - @staticmethod + @_contextualize def create_model( + self, name: str, run_settings: base.RunSettings, params: t.Optional[t.Dict[str, t.Any]] = None, @@ -553,7 +575,6 @@ def create_model( """ path = init_default(getcwd(), path, str) - # mcb if path is None: path = getcwd() if params is None: @@ -570,6 +591,7 @@ def create_model( logger.error(e) raise + @_contextualize def create_run_settings( self, exe: str, @@ -634,6 +656,7 @@ class in SmartSim. If found, the class corresponding logger.error(e) raise + @_contextualize def create_batch_settings( self, nodes: int = 1, @@ -694,6 +717,7 @@ def create_batch_settings( logger.error(e) raise + @_contextualize def create_database( self, port: int = 6379, @@ -777,6 +801,7 @@ def create_database( **kwargs, ) + @_contextualize def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: """Reconnect to a running ``Orchestrator`` @@ -797,6 +822,7 @@ def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: logger.error(e) raise + @_contextualize def summary(self, style: str = "github") -> str: """Return a summary of the ``Experiment`` diff --git a/smartsim/log.py b/smartsim/log.py index baf54f068..55cb88afb 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -23,27 +23,48 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import functools import logging -import os +import pathlib import sys +import threading import typing as t +from contextvars import ContextVar, copy_context import coloredlogs +from smartsim._core.config import CONFIG + # constants DEFAULT_DATE_FORMAT: t.Final[str] = "%H:%M:%S" DEFAULT_LOG_FORMAT: t.Final[str] = ( "%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s" ) +EXPERIMENT_LOG_FORMAT = DEFAULT_LOG_FORMAT.replace("s[%", "s {%(exp_path)s} [%") # configure colored loggs coloredlogs.DEFAULT_DATE_FORMAT = DEFAULT_DATE_FORMAT coloredlogs.DEFAULT_LOG_FORMAT = DEFAULT_LOG_FORMAT +# create context vars used by loggers +ctx_exp_path = ContextVar("exp_path", default="") + + +# Generic types for method contextualizers +_T = t.TypeVar("_T") +_RT = t.TypeVar("_RT") +_ContextT = t.TypeVar("_ContextT") + +if t.TYPE_CHECKING: + from typing_extensions import Concatenate, ParamSpec -def _get_log_level() -> str: - """Get the logging level based on environment variable - SMARTSIM_LOG_LEVEL. If not set, default to info. + _PR = ParamSpec("_PR") + + +def _translate_log_level(user_log_level: str = "info") -> str: + """Translate value of CONFIG.log_level to one + accepted as ``level`` option by Python's logging module. Logging levels - quiet: Just shows errors and warnings @@ -52,22 +73,106 @@ def _get_log_level() -> str: - developer: Shows everything happening during execution extremely verbose logging. 
- :return: Log level for coloredlogs + :param user_log_level: log level specified by user, defaults to info + :type user_log_level: str + :returns: Log level for coloredlogs :rtype: str """ - log_level = os.environ.get("SMARTSIM_LOG_LEVEL", "info").lower() - if log_level == "quiet": + user_log_level = user_log_level.lower() + if user_log_level in ["info", "debug", "warning"]: + return user_log_level + if user_log_level == "quiet": return "warning" - if log_level == "info": - return "info" - if log_level == "debug": - return "debug" # extremely verbose logging used internally - if log_level == "developer": + if user_log_level == "developer": return "debug" return "info" +def get_exp_log_paths() -> t.Tuple[t.Optional[pathlib.Path], t.Optional[pathlib.Path]]: + """Returns the output and error file paths to experiment logs. + Returns None for both paths if experiment context is unavailable. + + :returns: 2-tuple of paths to experiment logs in form (output_path, error_path) + if telemetry is enabled, a 2-tuple of None otherwise + :rtype: Tuple[pathlib.Path | None, pathlib.Path | None] + """ + default_paths = None, None + + if not CONFIG.telemetry_enabled: + return default_paths + + if _exp_path := ctx_exp_path.get(): + file_out = pathlib.Path(_exp_path) / CONFIG.telemetry_subdir / "smartsim.out" + file_err = pathlib.Path(_exp_path) / CONFIG.telemetry_subdir / "smartsim.err" + return file_out, file_err + + return default_paths + + +class ContextThread(threading.Thread): + """Thread that ensures the context vars of the caller are available""" + + def run(self) -> None: + """Execute a thread on a copy of the current thread context""" + ctx = copy_context() + return ctx.run(super().run) + + +class ContextInjectingLogFilter(logging.Filter): + """Filter that performs enrichment of a log record by adding context + information about the experiment being executed""" + + def filter(self, record: logging.LogRecord) -> bool: + """Enrich log records with active experiment context + 
+ :param record: the record to evaluate for filtering + :type record: logging.LogRecord + :returns: always True + :rtype: bool + """ + record.exp_path = ctx_exp_path.get() + return True + + +class ContextAwareLogger(logging.Logger): + """A logger customized to automatically write experiment logs to a + dynamic target directory by inspecting the value of a context var""" + + def __init__(self, name: str, level: t.Union[int, str] = 0) -> None: + super().__init__(name, level) + self.addFilter(ContextInjectingLogFilter(name="exp-ctx-log-filter")) + + def _log( + self, + level: int, + msg: object, + args: t.Any, + exc_info: t.Optional[t.Any] = None, + extra: t.Optional[t.Any] = None, + stack_info: bool = False, + stacklevel: int = 1, + ) -> None: + """Automatically attach file handlers if contextual information is found""" + file_out, file_err = get_exp_log_paths() + + if not all([file_out, file_err]): + super()._log(level, msg, args, exc_info, extra, stack_info, stacklevel) + return + + _lvl = logging.getLevelName(self.level) + fmt = EXPERIMENT_LOG_FORMAT + + low_pass = LowPassFilter(_lvl) + h_out = log_to_exp_file(str(file_out), self, _lvl, fmt, low_pass) + h_err = log_to_exp_file(str(file_err), self, "WARN", fmt) + + super()._log(level, msg, args, exc_info, extra, stack_info, stacklevel) + + for handler in [h_out, h_err]: + self.removeHandler(handler) + + def get_logger( name: str, log_level: t.Optional[str] = None, fmt: t.Optional[str] = None ) -> logging.Logger: @@ -99,19 +204,46 @@ def get_logger( """ # if name is None, then logger is the root logger # if not root logger, get the name of file without prefix. 
- user_log_level = _get_log_level() + user_log_level = CONFIG.log_level if user_log_level != "developer": name = "SmartSim" + logging.setLoggerClass(ContextAwareLogger) logger = logging.getLogger(name) if log_level: logger.setLevel(log_level) else: - log_level = user_log_level + log_level = _translate_log_level(user_log_level) coloredlogs.install(level=log_level, logger=logger, fmt=fmt, stream=sys.stdout) return logger +class LowPassFilter(logging.Filter): + """A filter that passes all records below a specified level""" + + def __init__(self, maximum_level: str = "INFO"): + """Create a low-pass log filter allowing messages below a specific log level + + :param maximum_level: The maximum log level to be passed by the filter + :type maximum_level: str + """ + super().__init__() + self.max = maximum_level + + def filter(self, record: logging.LogRecord) -> bool: + """Filter log records; pass those less than or equal to the maximum level + + :param record: the record to evaluate for filtering + :type record: logging.LogRecord + :returns: True if record level passes filter, False otherwise + :rtype: bool + """ + # If a string representation of the level is passed in, + # the corresponding numeric value is returned. + level_no: int = logging.getLevelName(self.max) + return record.levelno <= level_no + + def log_to_file(filename: str, log_level: str = "debug") -> None: """Installs a second filestream handler to the root logger, allowing subsequent logging calls to be sent to filename. @@ -122,10 +254,106 @@ def log_to_file(filename: str, log_level: str = "debug") -> None: :param log_level: as defined in get_logger. Can be specified to allow the file to store more or less verbose logging information. 
- :type log_level: int | str + :type log_level: str """ logger = logging.getLogger("SmartSim") stream = open( # pylint: disable=consider-using-with filename, "w+", encoding="utf-8" ) coloredlogs.install(stream=stream, logger=logger, level=log_level) + + +def log_to_exp_file( + filename: str, + logger: logging.Logger, + log_level: str = "warn", + fmt: t.Optional[str] = EXPERIMENT_LOG_FORMAT, + log_filter: t.Optional[logging.Filter] = None, +) -> logging.Handler: + """Installs a file handler on the given logger, + allowing subsequent logging calls to be sent to filename. + + :param filename: the name of the desired log file. + :type filename: str + :param log_level: as defined in get_logger. Can be specified + to allow the file to store more or less verbose + logging information. + :type log_level: str + :param logger: an existing logger to add the handler to + :type logger: logging.Logger + :param fmt: a log format for the handler (otherwise, EXPERIMENT_LOG_FORMAT) + :type fmt: (optional) str + :param log_filter: log filter to attach to handler + :type log_filter: (optional) logging.Filter + :return: the file handler that was added to the logger + :rtype: logging.Handler + """ + # ensure logs are written even if specified dir doesn't exist + log_path = pathlib.Path(filename) + if not log_path.parent.exists(): + log_path.parent.mkdir(parents=True, exist_ok=True) + + handler = logging.FileHandler(filename, mode="a+", encoding="utf-8") + + if log_filter: + handler.addFilter(log_filter) + + formatter = logging.Formatter(fmt=fmt, datefmt=DEFAULT_DATE_FORMAT) + + handler.setFormatter(formatter) + handler.setLevel(log_level.upper()) + + logger.addHandler(handler) + return handler + + +def method_contextualizer( + ctx_var: ContextVar[_ContextT], + ctx_map: t.Callable[[_T], _ContextT], +) -> """t.Callable[ + [t.Callable[Concatenate[_T, _PR], _RT]], + t.Callable[Concatenate[_T, _PR], _RT], +]""": + """Parameterized-decorator factory that enables a target value + to be placed 
into global context prior to execution of the + decorated method. + Usage Note: the use of `self` below requires that the decorated function is passed + the object containing a value that will be modified in the context. `ctx_map` + must accept an instance of matching type. + + :param ctx_var: The ContextVar that will be modified + :type ctx_var: ContextVar + :param ctx_map: A function that returns the value to be set to ctx_var + :type ctx_map: t.Callable[[_T], _ContextT]""" + + def _contextualize( + fn: "t.Callable[Concatenate[_T, _PR], _RT]", / + ) -> "t.Callable[Concatenate[_T, _PR], _RT]": + """Executes the decorated method in a cloned context and ensures + `ctx_var` is updated to the value returned by `ctx_map` prior to + calling the decorated method""" + + @functools.wraps(fn) + def _contextual( + self: _T, + *args: "_PR.args", + **kwargs: "_PR.kwargs", + ) -> _RT: + """A decorator operator that runs the decorated method in a new + context with the desired contextual information modified.""" + + def _ctx_modifier() -> _RT: + """Helper to simplify calling the target method with the + modified value set in `ctx_var`""" + ctx_val = ctx_map(self) + token = ctx_var.set(ctx_val) + result = fn(self, *args, **kwargs) + ctx_var.reset(token) + return result + + ctx = copy_context() + return ctx.run(_ctx_modifier) + + return _contextual + + return _contextualize diff --git a/smartsim/ml/__init__.py b/smartsim/ml/__init__.py index 84fd06b57..eb74c5957 100644 --- a/smartsim/ml/__init__.py +++ b/smartsim/ml/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/ml/data.py b/smartsim/ml/data.py index 3dfca9f0c..4cdc27c06 100644 --- a/smartsim/ml/data.py +++ b/smartsim/ml/data.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -35,6 +35,9 @@ from ..error import SSInternalError from ..log import get_logger +if t.TYPE_CHECKING: + import numpy.typing as npt + logger = get_logger(__name__) @@ -118,7 +121,7 @@ def download(self, client: Client) -> None: if "target_name" in field_names: self.target_name = info_ds.get_meta_strings("target_name")[0] if "num_classes" in field_names: - self.num_classes = info_ds.get_meta_scalars("num_classes")[0] + self.num_classes = int(info_ds.get_meta_scalars("num_classes")[0]) def __repr__(self) -> str: strings = ["DataInfo object"] @@ -311,8 +314,8 @@ def __init__( self.address = address self.cluster = cluster self.verbose = verbose - self.samples = None - self.targets = None + self.samples: t.Optional["npt.NDArray[t.Any]"] = None + self.targets: t.Optional["npt.NDArray[t.Any]"] = None self.num_samples = 0 self.indices = np.arange(0) self.shuffle = shuffle @@ -460,14 +463,20 @@ def _add_samples(self, indices: t.List[int]) -> None: if self.samples is not None: for dataset in datasets: self.samples = np.concatenate( - (self.samples, dataset.get_tensor(self.sample_name)) + ( + t.cast("npt.NDArray[t.Any]", self.samples), + dataset.get_tensor(self.sample_name), + ) ) if self.need_targets: self.targets = np.concatenate( - (self.targets, dataset.get_tensor(self.target_name)) + ( + t.cast("npt.NDArray[t.Any]", self.targets), + dataset.get_tensor(self.target_name), + ) ) - self.num_samples = self.samples.shape[0] + self.num_samples = t.cast("npt.NDArray[t.Any]", self.samples).shape[0] self.indices = 
np.arange(self.num_samples) self.log(f"New dataset size: {self.num_samples}, batches: {len(self)}") @@ -496,8 +505,8 @@ def update_data(self) -> None: np.random.shuffle(self.indices) def _data_generation( - self, indices: np.ndarray # type: ignore[type-arg] - ) -> t.Tuple[np.ndarray, np.ndarray]: # type: ignore[type-arg] + self, indices: "npt.NDArray[t.Any]" + ) -> t.Tuple["npt.NDArray[t.Any]", "npt.NDArray[t.Any]"]: # Initialization if self.samples is None: raise ValueError("Samples have not been initialized") @@ -505,10 +514,10 @@ def _data_generation( xval = self.samples[indices] if self.need_targets: - yval = self.targets[indices] + yval = t.cast("npt.NDArray[t.Any]", self.targets)[indices] elif self.autoencoding: yval = xval else: - return xval + return xval # type: ignore[no-any-return] return xval, yval diff --git a/smartsim/ml/tf/__init__.py b/smartsim/ml/tf/__init__.py index 2f6646dbd..eb3cb565e 100644 --- a/smartsim/ml/tf/__init__.py +++ b/smartsim/ml/tf/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/ml/tf/data.py b/smartsim/ml/tf/data.py index ae0b9aadd..ffc969551 100644 --- a/smartsim/ml/tf/data.py +++ b/smartsim/ml/tf/data.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -31,6 +31,9 @@ from smartsim.ml import DataDownloader +if t.TYPE_CHECKING: + import numpy.typing as npt + class _TFDataGenerationCommon(DataDownloader, keras.utils.Sequence): def __getitem__( @@ -60,7 +63,9 @@ def on_epoch_end(self) -> None: if self.shuffle: np.random.shuffle(self.indices) - def _data_generation(self, indices: np.ndarray) -> t.Tuple[np.ndarray, np.ndarray]: # type: ignore[type-arg] + def _data_generation( + self, indices: "npt.NDArray[t.Any]" + ) -> t.Tuple["npt.NDArray[t.Any]", "npt.NDArray[t.Any]"]: # Initialization if self.samples is None: raise ValueError("No samples loaded for data generation") @@ -68,13 +73,13 @@ def _data_generation(self, indices: np.ndarray) -> t.Tuple[np.ndarray, np.ndarra xval = self.samples[indices] if self.need_targets: - yval = self.targets[indices] + yval = t.cast("npt.NDArray[t.Any]", self.targets)[indices] if self.num_classes is not None: yval = keras.utils.to_categorical(yval, num_classes=self.num_classes) elif self.autoencoding: yval = xval else: - return xval + return xval # type: ignore[no-any-return] return xval, yval diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index c8018ac32..69c8e2580 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/ml/torch/__init__.py b/smartsim/ml/torch/__init__.py index b90a6ffdb..fcc0f2a1f 100644 --- a/smartsim/ml/torch/__init__.py +++ b/smartsim/ml/torch/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/ml/torch/data.py b/smartsim/ml/torch/data.py index 166a29e05..c6a8e6eac 100644 --- a/smartsim/ml/torch/data.py +++ b/smartsim/ml/torch/data.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/py.typed b/smartsim/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/servertype.py b/smartsim/servertype.py index a83149c23..06d0bc8e5 100644 --- a/smartsim/servertype.py +++ b/smartsim/servertype.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/__init__.py b/smartsim/settings/__init__.py index 542aeab1d..d417c9ef8 100644 --- a/smartsim/settings/__init__.py +++ b/smartsim/settings/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -26,7 +26,6 @@ from .alpsSettings import AprunSettings from .base import RunSettings, SettingsBase -from .cobaltSettings import CobaltBatchSettings from .containers import Container, Singularity from .lsfSettings import BsubBatchSettings, JsrunSettings from .mpiSettings import MpiexecSettings, MpirunSettings, OrterunSettings @@ -36,7 +35,6 @@ __all__ = [ "AprunSettings", - "CobaltBatchSettings", "BsubBatchSettings", "JsrunSettings", "MpirunSettings", diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index b36c3d333..5357312a5 100644 --- a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -43,8 +43,7 @@ def __init__( ): """Settings to run job with ``aprun`` command - ``AprunSettings`` can be used for both the `pbs` and `cobalt` - launchers. + ``AprunSettings`` can be used for the `pbs` launcher. :param exe: executable :type exe: str diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index a6df4eed4..284d435c0 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -1,5 +1,5 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -30,6 +30,7 @@ from smartsim.settings.containers import Container from .._core.utils.helpers import expand_exe_path, fmt_dict, is_valid_cmd +from ..entity.dbobject import DBModel, DBScript from ..log import get_logger logger = get_logger(__name__) @@ -96,7 +97,23 @@ def __init__( self.container = container self._run_command = run_command self.in_batch = False - self.colocated_db_settings: t.Optional[t.Dict[str, str]] = None + self.colocated_db_settings: t.Optional[ + t.Dict[ + str, + t.Union[ + bool, + int, + str, + None, + t.List[str], + t.Iterable[t.Union[int, t.Iterable[int]]], + t.List[DBModel], + t.List[DBScript], + t.Dict[str, t.Union[int, None]], + t.Dict[str, str], + ], + ] + ] = None @property def exe_args(self) -> t.Union[str, t.List[str]]: diff --git a/smartsim/settings/cobaltSettings.py b/smartsim/settings/cobaltSettings.py deleted file mode 100644 index 5a0e07b40..000000000 --- a/smartsim/settings/cobaltSettings.py +++ /dev/null @@ -1,171 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import typing as t - -from .base import BatchSettings - - -class CobaltBatchSettings(BatchSettings): - def __init__( - self, - nodes: t.Optional[int] = None, - time: str = "", - queue: t.Optional[str] = None, - account: t.Optional[str] = None, - batch_args: t.Optional[t.Dict[str, t.Optional[str]]] = None, - **kwargs: t.Any, - ) -> None: - """Specify settings for a Cobalt ``qsub`` batch launch - - If the argument doesn't have a parameter, put None - as the value. e.g. {'exclusive': None} - - Initialization values provided (nodes, time, account) - will overwrite the same arguments in ``batch_args`` if present - - :param nodes: number of nodes, defaults to None - :type nodes: int, optional - :param time: walltime for job, e.g. "10:00:00" for 10 hours, - defaults to empty str - :type time: str, optional - :param queue: queue to launch job in, defaults to None - :type queue: str, optional - :param account: account for job, defaults to None - :type account: str, optional - :param batch_args: extra batch arguments, defaults to None - :type batch_args: dict[str, str], optional - """ - super().__init__( - "qsub", - batch_args=batch_args, - nodes=nodes, - account=account, - queue=queue, - time=time, - **kwargs, - ) - - def set_walltime(self, walltime: str) -> None: - """Set the walltime of the job - - format = "HH:MM:SS" - - Cobalt walltime can also be specified with number - of minutes. 
- - :param walltime: wall time - :type walltime: str - """ - # TODO check for formatting errors here - # TODO catch existing "t" in batch_args - if walltime: - self.batch_args["time"] = walltime - - def set_nodes(self, num_nodes: int) -> None: - """Set the number of nodes for this batch job - - :param num_nodes: number of nodes - :type num_nodes: int - """ - # TODO catch existing "n" in batch_args - if num_nodes: - self.batch_args["nodecount"] = str(int(num_nodes)) - - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: - """Specify the hostlist for this job - - :param host_list: hosts to launch on - :type host_list: str | list[str] - :raises TypeError: if not str or list of str - """ - if isinstance(host_list, str): - host_list = [host_list.strip()] - if not isinstance(host_list, list): - raise TypeError("host_list argument must be a list of strings") - if not all(isinstance(host, str) for host in host_list): - raise TypeError("host_list argument must be list of strings") - hosts = ",".join(host_list) - self.batch_args["attrs"] = f"location={hosts}" - - def set_tasks(self, num_tasks: int) -> None: - """Set total number of processes to start - - :param num_tasks: number of processes - :type num_tasks: int - """ - self.batch_args["proccount"] = str(int(num_tasks)) - - def set_queue(self, queue: str) -> None: - """Set the queue for the batch job - - :param queue: queue name - :type queue: str - """ - # TODO catch existing "q" in batch args - if queue: - self.batch_args["queue"] = str(queue) - - def set_account(self, account: str) -> None: - """Set the account for this batch job - - :param acct: account id - :type acct: str - """ - # TODO catch existing "A" in batch_args - if account: - self.batch_args["project"] = account - - def format_batch_args(self) -> t.List[str]: - """Get the formatted batch arguments for a preview - - :return: list of batch arguments for Sbatch - :rtype: list[str] - """ - restricted = [ - "o", - "output", # output is 
determined by interface - "O", - "outputprefix", # step name is output prefix - "e", - "error", # error is determined by interface - "cwd", # cwd is determined by interface - "jobname", # step name is jobname - ] - opts = [] - for opt, value in self.batch_args.items(): - if opt not in restricted: - # attach "-" prefix if argument is 1 character otherwise "--" - short_arg = bool(len(str(opt)) == 1) - prefix = "-" if short_arg else "--" - if not value: - opts += [prefix + opt] - else: - if short_arg: - opts += [prefix + opt, str(value)] - else: - opts += [" ".join((prefix + opt, str(value)))] - return opts diff --git a/smartsim/settings/containers.py b/smartsim/settings/containers.py index 6d5a72f80..bdba1ce88 100644 --- a/smartsim/settings/containers.py +++ b/smartsim/settings/containers.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/lsfSettings.py b/smartsim/settings/lsfSettings.py index 47fe91802..32902c8c6 100644 --- a/smartsim/settings/lsfSettings.py +++ b/smartsim/settings/lsfSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -97,7 +97,7 @@ def set_cpus_per_rs(self, cpus_per_rs: int) -> None: :type cpus_per_rs: int or str """ if self.colocated_db_settings: - db_cpus = int(self.colocated_db_settings.get("db_cpus", 0)) + db_cpus = int(t.cast(int, self.colocated_db_settings.get("db_cpus", 0))) if not db_cpus: raise ValueError("db_cpus must be configured on colocated_db_settings") diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index 5b6b520e3..ce132bcc5 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/settings/mpirunSettings.py index b290e2355..994d62bba 100644 --- a/smartsim/settings/mpirunSettings.py +++ b/smartsim/settings/mpirunSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index bcfee1ff1..e43cd9466 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 0a4b0868a..19a58b11c 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index b09286e8c..6e6172507 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +31,6 @@ from ..settings import ( AprunSettings, BsubBatchSettings, - CobaltBatchSettings, Container, JsrunSettings, MpiexecSettings, @@ -81,7 +80,6 @@ def create_batch_settings( """ # all supported batch class implementations by_launcher: t.Dict[str, t.Callable[..., base.BatchSettings]] = { - "cobalt": CobaltBatchSettings, "pbs": QsubBatchSettings, "slurm": SbatchSettings, "lsf": BsubBatchSettings, @@ -164,7 +162,6 @@ def create_run_settings( "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], - "cobalt": ["aprun", "mpirun", "mpiexec"], "lsf": ["jsrun", "mpirun", "mpiexec"], "local": [""], } diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 8da8659e1..935a8df39 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -256,13 +256,9 @@ def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str: :param seconds: number of seconds to run job :type seconds: int :returns: Formatted walltime - :rtype + :rtype: str """ - delta = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) - fmt_str = str(delta) - if delta.seconds // 3600 < 10: - fmt_str = "0" + fmt_str - return fmt_str + return fmt_walltime(hours, minutes, seconds) def set_walltime(self, walltime: str) -> None: """Set the walltime of the job @@ -390,6 +386,27 @@ def format_comma_sep_env_vars(self) -> t.Tuple[str, t.List[str]]: return fmt_exported_env, compound_env +def fmt_walltime(hours: int, minutes: int, seconds: int) -> str: + """Helper function walltime format conversion + + Converts time to format HH:MM:SS + + :param hours: number of hours to run job + :type hours: int + :param minutes: number of minutes to run job + :type minutes: int + :param seconds: number of seconds to run job + :type seconds: int + :returns: Formatted walltime + :rtype: str + """ + delta = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) + fmt_str = str(delta) + if delta.seconds // 3600 < 10: + fmt_str = "0" + fmt_str + return fmt_str + + class SbatchSettings(BatchSettings): def __init__( self, diff --git a/smartsim/slurm.py b/smartsim/slurm.py index 105800a14..6a32d0213 100644 --- a/smartsim/slurm.py +++ b/smartsim/slurm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/status.py b/smartsim/status.py index 74d440b8e..409ec8c1a 100644 --- a/smartsim/status.py +++ b/smartsim/status.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/wlm/__init__.py b/smartsim/wlm/__init__.py index d7dd298be..3a82a81e5 100644 --- a/smartsim/wlm/__init__.py +++ b/smartsim/wlm/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -36,7 +36,7 @@ def detect_launcher() -> str: """Detect available launcher.""" - # Precedence: PBS, Cobalt, LSF, Slurm, local + # Precedence: PBS, LSF, Slurm, local if which("qsub") and which("qstat") and which("qdel"): qsub_version = run( ["qsub", "--version"], @@ -47,8 +47,6 @@ def detect_launcher() -> str: ) if "pbs" in (qsub_version.stdout).lower(): return "pbs" - if "cobalt" in (qsub_version.stdout).lower(): - return "cobalt" if all( [which("bsub"), which("jsrun"), which("jslist"), which("bjobs"), which("bkill")] ): @@ -66,9 +64,7 @@ def detect_launcher() -> str: ): return "slurm" # Systems like ThetaGPU don't have - # Cobalt or PBS on compute nodes - if "COBALT_JOBID" in os.environ: - return "cobalt" + # PBS on compute nodes if "PBS_JOBID" in os.environ: return "pbs" return "local" diff --git a/smartsim/wlm/pbs.py b/smartsim/wlm/pbs.py index 995ba5fc9..eda5baf24 100644 --- a/smartsim/wlm/pbs.py +++ b/smartsim/wlm/pbs.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py index ba46fb64c..9308eea98 100644 --- a/smartsim/wlm/slurm.py +++ b/smartsim/wlm/slurm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -39,6 +39,7 @@ SSReservedKeywordError, ) from ..log import get_logger +from ..settings.slurmSettings import fmt_walltime logger = get_logger(__name__) @@ -248,7 +249,7 @@ def _get_alloc_cmd( "SmartSim", ] if time: - salloc_args.extend(["-t", time]) + salloc_args.extend(["-t", _validate_time_format(time)]) if account: salloc_args.extend(["-A", str(account)]) @@ -273,6 +274,25 @@ def _get_alloc_cmd( return salloc_args +def _validate_time_format(time: str) -> str: + """Convert time into valid walltime format + + By default the formatted wall time is the total number of seconds. + + :param time: walltime formatted as `HH:MM:SS` + :type time: str + :returns: Formatted walltime + :rtype: str + """ + try: + hours, minutes, seconds = map(int, time.split(":")) + except ValueError as e: + raise ValueError( + "Input time must be formatted as `HH:MM:SS` with valid Integers." + ) from e + return fmt_walltime(hours, minutes, seconds) + + def get_hosts() -> t.List[str]: """Get the name of the nodes used in a slurm allocation. diff --git a/tests/__init__.py b/tests/__init__.py index bf6fd954c..efe03908e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved.
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/run_sklearn_onnx.py b/tests/backends/run_sklearn_onnx.py index d4377bbde..f10c8c7fb 100644 --- a/tests/backends/run_sklearn_onnx.py +++ b/tests/backends/run_sklearn_onnx.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/run_tf.py b/tests/backends/run_tf.py index c9cf0ee04..ec5d0142b 100644 --- a/tests/backends/run_tf.py +++ b/tests/backends/run_tf.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/run_torch.py b/tests/backends/run_torch.py index d57cfad9d..6e9ba2859 100644 --- a/tests/backends/run_torch.py +++ b/tests/backends/run_torch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_cli_mini_exp.py b/tests/backends/test_cli_mini_exp.py index 7c793e915..f02f44270 100644 --- a/tests/backends/test_cli_mini_exp.py +++ b/tests/backends/test_cli_mini_exp.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index 37c4296ef..d02f3f33c 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 1cfc1efcb..75e9f515d 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -138,6 +138,7 @@ def create_tf_cnn(): def save_torch_cnn(path, file_name): n = PyTorchNet() + n.eval() example_forward_input = torch.rand(1, 1, 28, 28) module = torch.jit.trace(n, example_forward_input) torch.jit.save(module, path + "/" + file_name) @@ -858,3 +859,83 @@ def test_inconsistent_params_db_model(): ex.value.args[0] == "Cannot set devices_per_node>1 if CPU is specified under devices" ) + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +def test_db_model_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): + """Test DBModels on remote DB, with an ensemble""" + + # Set experiment name + exp_name = "test-db-model-ensemble-duplicate" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = 1 # TF backend fails on multiple GPUs + + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + # 
Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings.set_nodes(1) + run_settings.set_tasks(1) + + # Create ensemble + smartsim_ensemble = exp.create_ensemble( + "smartsim_ensemble", run_settings=run_settings, replicas=2 + ) + + # Create Model + smartsim_model = exp.create_model("smartsim_model", run_settings) + + # Create and save ML model to filesystem + model, inputs, outputs = create_tf_cnn() + model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + + # Add the first ML model to all of the ensemble members + smartsim_ensemble.add_ml_model( + "cnn", + "TF", + model=model, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=inputs, + outputs=outputs, + ) + + # Attempt to add a duplicate ML model to Ensemble via Ensemble.add_ml_model() + with pytest.raises(SSUnsupportedError) as ex: + smartsim_ensemble.add_ml_model( + "cnn", + "TF", + model=model, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=inputs, + outputs=outputs, + ) + assert ex.value.args[0] == 'An ML Model with name "cnn" already exists' + + # Add same name ML model to a new SmartSim Model + smartsim_model.add_ml_model( + "cnn", + "TF", + model_path=model_file2, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=inputs2, + outputs=outputs2, + ) + + # Attempt to add a duplicate ML model to Ensemble via Ensemble.add_model() + with pytest.raises(SSUnsupportedError) as ex: + smartsim_ensemble.add_model(smartsim_model) + assert ex.value.args[0] == 'An ML Model with name "cnn" already exists' diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index e6cacd4d0..2bffd1da6 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright 
(c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -623,3 +623,107 @@ def test_inconsistent_params_db_script(fileutils): ex.value.args[0] == "Cannot set first_device>0 if CPU is specified under devices" ) + + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_db_script_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): + """Test DB scripts on remote DB""" + + # Set experiment name + exp_name = "test-db-script" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings.set_nodes(1) + run_settings.set_tasks(1) + + # Create Ensemble with two identical models + ensemble = exp.create_ensemble( + "dbscript_ensemble", run_settings=run_settings, replicas=2 + ) + + # Create SmartSim model + smartsim_model = exp.create_model("smartsim_model", run_settings) + # Create 2nd SmartSim model + smartsim_model_2 = exp.create_model("smartsim_model_2", run_settings) + # Create the script string + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + # Add the first ML script to all of the ensemble members + ensemble.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + # Attempt to add a duplicate ML 
model to Ensemble via Ensemble.add_script() + with pytest.raises(SSUnsupportedError) as ex: + ensemble.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + assert ex.value.args[0] == 'A Script with name "test_script1" already exists' + + # Add the first function to all of the ensemble members + ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + # Attempt to add a duplicate ML model to Ensemble via Ensemble.add_function() + with pytest.raises(SSUnsupportedError) as ex: + ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + assert ex.value.args[0] == 'A Script with name "test_func" already exists' + + # Add a script with a non-unique name to a SmartSim Model + smartsim_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + with pytest.raises(SSUnsupportedError) as ex: + ensemble.add_model(smartsim_model) + assert ex.value.args[0] == 'A Script with name "test_script1" already exists' + + # Add a function with a non-unique name to a SmartSim Model + smartsim_model_2.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + with pytest.raises(SSUnsupportedError) as ex: + ensemble.add_model(smartsim_model_2) + assert ex.value.args[0] == 'A Script with name "test_func" already exists' diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index 19c40017e..7c0e97e41 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index 06c148a95..af04c89cb 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index 71a63adb9..76a989a2e 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 2b7db11e1..c69b1746a 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -58,8 +58,6 @@ def test_batch_model(fileutils, test_dir, wlmutils): batch_settings.set_account(wlmutils.get_test_account()) add_batch_resources(wlmutils, batch_settings) - if wlmutils.get_test_launcher() == "cobalt": - batch_settings.set_queue("debug-flat-quad") run_settings = wlmutils.get_run_settings("python", f"{script} --time=5") model = exp.create_model( "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings @@ -87,8 +85,6 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) - if wlmutils.get_test_launcher() == "cobalt": - batch.set_queue("debug-flat-quad") ensemble = exp.create_ensemble("batch-ens", batch_settings=batch) ensemble.add_model(M1) ensemble.add_model(M2) @@ -110,12 +106,6 @@ def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - sleep(30) - batch.set_queue("debug-flat-quad") ensemble = exp.create_ensemble( "batch-ens-replicas", batch_settings=batch, run_settings=settings, replicas=2 ) diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index f1f5952b3..058aef895 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -60,9 +60,6 @@ def test_launch_orc_auto_batch(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:02:00") - if wlmutils.get_test_launcher() == "cobalt": - orc.batch_settings.set_queue("debug-flat-quad") - orc.set_path(test_dir) exp.start(orc, block=True) @@ -99,12 +96,6 @@ def test_launch_cluster_orc_batch_single(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:02:00") - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - time.sleep(120) - orc.batch_settings.set_queue("debug-flat-quad") orc.set_path(test_dir) exp.start(orc, block=True) @@ -141,12 +132,6 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:03:00") - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - time.sleep(120) - orc.batch_settings.set_queue("debug-flat-quad") orc.set_path(test_dir) exp.start(orc, block=True) @@ -178,12 +163,6 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:03:00") - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - time.sleep(120) - orc.batch_settings.set_queue("debug-flat-quad") exp.start(orc, block=True) diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 18e918cfd..7f6cc2ea2 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -1,6 +1,6 
@@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -48,9 +48,9 @@ def test_mpmd(fileutils, test_dir, wlmutils): exp_name = "test-mpmd" launcher = wlmutils.get_test_launcher() # MPMD is supported in LSF, but the test for it is different - mpmd_supported = ["slurm", "pbs", "cobalt"] + mpmd_supported = ["slurm", "pbs"] if launcher not in mpmd_supported: - pytest.skip("Test requires Slurm, PBS, or Cobalt to run") + pytest.skip("Test requires Slurm, or PBS to run") # aprun returns an error if the launched app is not an MPI exec # as we do not want to add mpi4py as a dependency, we prefer to @@ -58,7 +58,6 @@ def test_mpmd(fileutils, test_dir, wlmutils): by_launcher = { "slurm": ["srun", "mpirun"], "pbs": ["mpirun"], - "cobalt": ["mpirun"], } exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) diff --git a/tests/full_wlm/test_slurm_allocation.py b/tests/full_wlm/test_slurm_allocation.py index 01d40bf2f..95de1f426 100644 --- a/tests/full_wlm/test_slurm_allocation.py +++ b/tests/full_wlm/test_slurm_allocation.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -36,6 +36,29 @@ pytestmark = pytest.mark.skip(reason="Test is only for Slurm WLM systems") +def test_invalid_time_format(wlmutils): + """test slurm interface for formatting walltimes""" + account = wlmutils.get_test_account() + with pytest.raises(ValueError) as e: + alloc = slurm.get_allocation(nodes=1, time="000500", account=account) + assert ( + "Input time must be formatted as `HH:MM:SS` with valid Integers." 
+ in e.value.args[0] + ) + with pytest.raises(ValueError) as e: + alloc = slurm.get_allocation(nodes=1, time="00-05-00", account=account) + assert ( + "Input time must be formatted as `HH:MM:SS` with valid Integers." + in e.value.args[0] + ) + with pytest.raises(ValueError) as e: + alloc = slurm.get_allocation(nodes=1, time="TE:HE:HE", account=account) + assert ( + "Input time must be formatted as `HH:MM:SS` with valid Integers." + in e.value.args[0] + ) + + def test_get_release_allocation(wlmutils): """test slurm interface for obtaining allocations""" account = wlmutils.get_test_account() diff --git a/tests/full_wlm/test_wlm_helper_functions.py b/tests/full_wlm/test_wlm_helper_functions.py index 452d10419..5723939f5 100644 --- a/tests/full_wlm/test_wlm_helper_functions.py +++ b/tests/full_wlm/test_wlm_helper_functions.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_buildenv.py b/tests/install/test_buildenv.py similarity index 98% rename from tests/test_buildenv.py rename to tests/install/test_buildenv.py index d362ca1dd..21b9a49b8 100644 --- a/tests/test_buildenv.py +++ b/tests/install/test_buildenv.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py new file mode 100644 index 000000000..5e6c8e597 --- /dev/null +++ b/tests/install/test_builder.py @@ -0,0 +1,364 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import functools +import pathlib +import platform +import threading +import time + +import pytest + +import smartsim._core._install.builder as build +from smartsim._core._install.buildenv import RedisAIVersion + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +RAI_VERSIONS = RedisAIVersion("1.2.7") + +for_each_device = pytest.mark.parametrize("device", ["cpu", "gpu"]) + +_toggle_build_optional_backend = lambda backend: pytest.mark.parametrize( + f"build_{backend}", + [ + pytest.param(switch, id=f"with{'' if switch else 'out'}-{backend}") + for switch in (True, False) + ], +) +toggle_build_tf = _toggle_build_optional_backend("tf") +toggle_build_pt = _toggle_build_optional_backend("pt") +toggle_build_ort = _toggle_build_optional_backend("ort") + + +@pytest.mark.parametrize( + "mock_os", [pytest.param(os_, id=f"os='{os_}'") for os_ in ("Windows", "Java", "")] +) +def test_os_enum_raises_on_unsupported(mock_os): + with pytest.raises(build.BuildError, match="operating system") as err_info: + build.OperatingSystem.from_str(mock_os) + + +@pytest.mark.parametrize( + "mock_arch", + [ + pytest.param(arch_, id=f"arch='{arch_}'") + for arch_ in ("i386", "i686", "i86pc", "aarch64", "armv7l", "") + ], +) +def test_arch_enum_raises_on_unsupported(mock_arch): + with pytest.raises(build.BuildError, match="architecture"): + build.Architecture.from_str(mock_arch) + + +@pytest.fixture +def p_test_dir(test_dir): + yield pathlib.Path(test_dir).resolve() + + +@for_each_device +def test_rai_builder_raises_if_attempting_to_place_deps_when_build_dir_dne( + monkeypatch, p_test_dir, device +): + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + monkeypatch.setattr( + build.RedisAIBuilder, + "rai_build_path", + property(lambda self: p_test_dir / "path/to/dir/that/dne"), + ) + rai_builder = build.RedisAIBuilder() + with pytest.raises(build.BuildError, match=r"build directory not found"): + 
rai_builder._fetch_deps_for(device) + + +@for_each_device +def test_rai_builder_raises_if_attempting_to_place_deps_in_nonempty_dir( + monkeypatch, p_test_dir, device +): + (p_test_dir / "some_file.txt").touch() + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + monkeypatch.setattr( + build.RedisAIBuilder, "rai_build_path", property(lambda self: p_test_dir) + ) + monkeypatch.setattr( + build.RedisAIBuilder, "get_deps_dir_path_for", lambda *a, **kw: p_test_dir + ) + rai_builder = build.RedisAIBuilder() + + with pytest.raises(build.BuildError, match=r"is not empty"): + rai_builder._fetch_deps_for(device) + + +invalid_build_arm64 = [ + dict(build_tf=True, build_onnx=True), + dict(build_tf=False, build_onnx=True), + dict(build_tf=True, build_onnx=False), +] +invalid_build_ids = [ + ",".join([f"{key}={value}" for key, value in d.items()]) + for d in invalid_build_arm64 +] + + +@pytest.mark.parametrize("build_options", invalid_build_arm64, ids=invalid_build_ids) +def test_rai_builder_raises_if_unsupported_deps_on_arm64(build_options): + with pytest.raises(build.BuildError, match=r"are not supported on.*ARM64"): + build.RedisAIBuilder( + _os=build.OperatingSystem.DARWIN, + architecture=build.Architecture.ARM64, + **build_options, + ) + + +def _confirm_inst_presence(type_, should_be_present, seq): + expected_num_occurrences = 1 if should_be_present else 0 + occurrences = filter(lambda item: isinstance(item, type_), seq) + return expected_num_occurrences == len(tuple(occurrences)) + + +# Helper functions to check for the presence (or absence) of a +# ``_RAIBuildDependency`` dependency in a list of dependencies that need to be +# fetched by a ``RedisAIBuilder`` instance +dlpack_dep_presence = functools.partial( + _confirm_inst_presence, build._DLPackRepository, True +) +pt_dep_presence = functools.partial(_confirm_inst_presence, build._PTArchive) +tf_dep_presence = functools.partial(_confirm_inst_presence, build._TFArchive) +ort_dep_presence = 
functools.partial(_confirm_inst_presence, build._ORTArchive) + + +@for_each_device +@toggle_build_tf +@toggle_build_pt +@toggle_build_ort +def test_rai_builder_will_add_dep_if_backend_requested_wo_duplicates( + monkeypatch, device, build_tf, build_pt, build_ort +): + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + + rai_builder = build.RedisAIBuilder( + build_tf=build_tf, build_torch=build_pt, build_onnx=build_ort + ) + requested_backends = rai_builder._get_deps_to_fetch_for(device) + assert dlpack_dep_presence(requested_backends) + assert tf_dep_presence(build_tf, requested_backends) + assert pt_dep_presence(build_pt, requested_backends) + assert ort_dep_presence(build_ort, requested_backends) + + +@for_each_device +@toggle_build_tf +@toggle_build_pt +def test_rai_builder_will_not_add_dep_if_custom_dep_path_provided( + monkeypatch, device, p_test_dir, build_tf, build_pt +): + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + mock_ml_lib = p_test_dir / "some/ml/lib" + mock_ml_lib.mkdir(parents=True) + rai_builder = build.RedisAIBuilder( + build_tf=build_tf, + build_torch=build_pt, + build_onnx=False, + libtf_dir=str(mock_ml_lib if build_tf else ""), + torch_dir=str(mock_ml_lib if build_pt else ""), + ) + requested_backends = rai_builder._get_deps_to_fetch_for(device) + assert dlpack_dep_presence(requested_backends) + assert tf_dep_presence(False, requested_backends) + assert pt_dep_presence(False, requested_backends) + assert ort_dep_presence(False, requested_backends) + assert len(requested_backends) == 1 + + +def test_rai_builder_raises_if_it_fetches_an_unexpected_number_of_ml_deps( + monkeypatch, p_test_dir +): + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + monkeypatch.setattr( + build.RedisAIBuilder, "rai_build_path", property(lambda self: p_test_dir) + ) + monkeypatch.setattr( + build, + "_place_rai_dep_at", + lambda target, verbose: lambda dep: target + / 
"whoops_all_ml_deps_extract_to_a_dir_with_this_name", + ) + rai_builder = build.RedisAIBuilder(build_tf=True, build_torch=True, build_onnx=True) + with pytest.raises( + build.BuildError, + match=r"Expected to place \d+ dependencies, but only found \d+", + ): + rai_builder._fetch_deps_for("cpu") + + +def test_threaded_map(): + def _some_io_op(x): + return x * x + + assert (0, 1, 4, 9, 16) == tuple(build._threaded_map(_some_io_op, range(5))) + + +def test_threaded_map_returns_early_if_nothing_to_map(): + sleep_duration = 60 + + def _some_long_io_op(_): + time.sleep(sleep_duration) + + start = time.time() + build._threaded_map(_some_long_io_op, []) + end = time.time() + assert end - start < sleep_duration + + +def test_correct_pt_variant_os(): + # Check that all Linux variants return Linux + for linux_variant in build.OperatingSystem.LINUX.value: + os_ = build.OperatingSystem.from_str(linux_variant) + assert build._choose_pt_variant(os_) == build._PTArchiveLinux + + # Check that ARM64 and X86_64 Mac OSX return the Mac variant + all_archs = (build.Architecture.ARM64, build.Architecture.X64) + for arch in all_archs: + os_ = build.OperatingSystem.DARWIN + assert build._choose_pt_variant(os_) == build._PTArchiveMacOSX + + +def test_PTArchiveMacOSX_url(): + arch = build.Architecture.X64 + pt_version = RAI_VERSIONS.torch + + pt_linux_cpu = build._PTArchiveLinux(build.Architecture.X64, "cpu", pt_version) + x64_prefix = "https://download.pytorch.org/libtorch/" + assert x64_prefix in pt_linux_cpu.url + + pt_macosx_cpu = build._PTArchiveMacOSX(build.Architecture.ARM64, "cpu", pt_version) + arm64_prefix = "https://github.com/CrayLabs/ml_lib_builder/releases/download/" + assert arm64_prefix in pt_macosx_cpu.url + + +def test_PTArchiveMacOSX_gpu_error(): + with pytest.raises(build.BuildError, match="support GPU on Mac OSX"): + build._PTArchiveMacOSX(build.Architecture.ARM64, "gpu", RAI_VERSIONS.torch).url + + +def test_valid_platforms(): + assert build.RedisAIBuilder( + 
_os=build.OperatingSystem.LINUX, + architecture=build.Architecture.X64, + build_tf=True, + build_torch=True, + build_onnx=True, + ) + assert build.RedisAIBuilder( + _os=build.OperatingSystem.DARWIN, + architecture=build.Architecture.X64, + build_tf=True, + build_torch=True, + build_onnx=False, + ) + assert build.RedisAIBuilder( + _os=build.OperatingSystem.DARWIN, + architecture=build.Architecture.X64, + build_tf=False, + build_torch=True, + build_onnx=False, + ) + + +@pytest.mark.parametrize( + "plat,cmd,expected_cmd", + [ + # Bare Word + pytest.param( + build.Platform(build.OperatingSystem.LINUX, build.Architecture.X64), + ["git", "clone", "my-repo"], + ["git", "clone", "my-repo"], + id="git-Linux-X64", + ), + pytest.param( + build.Platform(build.OperatingSystem.LINUX, build.Architecture.ARM64), + ["git", "clone", "my-repo"], + ["git", "clone", "my-repo"], + id="git-Linux-Arm64", + ), + pytest.param( + build.Platform(build.OperatingSystem.DARWIN, build.Architecture.X64), + ["git", "clone", "my-repo"], + ["git", "clone", "my-repo"], + id="git-Darwin-X64", + ), + pytest.param( + build.Platform(build.OperatingSystem.DARWIN, build.Architecture.ARM64), + ["git", "clone", "my-repo"], + [ + "git", + "clone", + "--config", + "core.autocrlf=false", + "--config", + "core.eol=lf", + "my-repo", + ], + id="git-Darwin-Arm64", + ), + # Abs path + pytest.param( + build.Platform(build.OperatingSystem.LINUX, build.Architecture.X64), + ["/path/to/git", "clone", "my-repo"], + ["/path/to/git", "clone", "my-repo"], + id="Abs-Linux-X64", + ), + pytest.param( + build.Platform(build.OperatingSystem.LINUX, build.Architecture.ARM64), + ["/path/to/git", "clone", "my-repo"], + ["/path/to/git", "clone", "my-repo"], + id="Abs-Linux-Arm64", + ), + pytest.param( + build.Platform(build.OperatingSystem.DARWIN, build.Architecture.X64), + ["/path/to/git", "clone", "my-repo"], + ["/path/to/git", "clone", "my-repo"], + id="Abs-Darwin-X64", + ), + pytest.param( + 
build.Platform(build.OperatingSystem.DARWIN, build.Architecture.ARM64), + ["/path/to/git", "clone", "my-repo"], + [ + "/path/to/git", + "clone", + "--config", + "core.autocrlf=false", + "--config", + "core.eol=lf", + "my-repo", + ], + id="Abs-Darwin-Arm64", + ), + ], +) +def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected_cmd): + assert build.config_git_command(plat, cmd) == expected_cmd diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py index d75cc635f..0b31eedd2 100644 --- a/tests/on_wlm/test_base_settings_on_wlm.py +++ b/tests/on_wlm/test_base_settings_on_wlm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index fa05eb513..8baf74bf4 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index 198a92f43..8dc4baae0 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index ab100d1a7..6cf1c3918 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_het_job.py b/tests/on_wlm/test_het_job.py index e8f20d1ee..5a039a7c9 100644 --- a/tests/on_wlm/test_het_job.py +++ b/tests/on_wlm/test_het_job.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py index 7281cb568..905d96f54 100644 --- a/tests/on_wlm/test_launch_errors.py +++ b/tests/on_wlm/test_launch_errors.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_launch_ompi_lsf.py b/tests/on_wlm/test_launch_ompi_lsf.py index ed082d22e..ed5de291b 100644 --- a/tests/on_wlm/test_launch_ompi_lsf.py +++ b/tests/on_wlm/test_launch_ompi_lsf.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_local_step.py b/tests/on_wlm/test_local_step.py index 02e619ebf..4e5f45e0b 100644 --- a/tests/on_wlm/test_local_step.py +++ b/tests/on_wlm/test_local_step.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py index b1997961d..42bbe752c 100644 --- a/tests/on_wlm/test_restart.py +++ b/tests/on_wlm/test_restart.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index 08bf875e2..1611781eb 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -50,10 +50,8 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() - if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: - pytest.skip( - "Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM" - ) + if launcher not in ["pbs", "slurm", "lsf"]: + pytest.skip("Test only runs on systems with LSF, PBSPro, or Slurm as WLM") exp_name = "test-simplebase-settings-model-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) @@ -70,10 +68,8 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() - if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: - pytest.skip( - "Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM" - ) + if launcher not in ["pbs", "slurm", "lsf"]: + pytest.skip("Test only runs on systems with LSF, PBSPro, or Slurm as WLM") exp_name = "test-simplebase-settings-model-stop" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index a05d7be0f..1ecc27442 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_slurm_commands.py b/tests/on_wlm/test_slurm_commands.py index d3ebbcd31..8411be6e0 100644 --- a/tests/on_wlm/test_slurm_commands.py +++ b/tests/on_wlm/test_slurm_commands.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index 383c6c4bd..8d75d9f65 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_wlm_orc_config_settings.py b/tests/on_wlm/test_wlm_orc_config_settings.py index 3de59075e..f9ab60609 100644 --- a/tests/on_wlm/test_wlm_orc_config_settings.py +++ b/tests/on_wlm/test_wlm_orc_config_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_alps_settings.py b/tests/test_alps_settings.py index 012f27fce..b3c4c3bdb 100644 --- a/tests/test_alps_settings.py +++ b/tests/test_alps_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_batch_settings.py b/tests/test_batch_settings.py index cb2096727..db269a9b5 100644 --- a/tests/test_batch_settings.py +++ b/tests/test_batch_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_cli.py b/tests/test_cli.py index 899caa1e0..710a9a659 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -828,3 +828,33 @@ def test_cli_validation_test_execute( assert expected_stdout in caplog.text assert actual_retval == expected_retval + + +def test_validate_correctly_sets_and_restores_env(monkeypatch): + monkeypatch.setenv("FOO", "BAR") + monkeypatch.setenv("SPAM", "EGGS") + monkeypatch.delenv("TICK", raising=False) + monkeypatch.delenv("DNE", raising=False) + + assert os.environ["FOO"] == "BAR" + assert os.environ["SPAM"] == "EGGS" + assert "TICK" not in os.environ + assert "DNE" not in os.environ + + with smartsim._core._cli.validate._env_vars_set_to( + { + "FOO": "BAZ", # Redefine + "SPAM": None, # Delete + "TICK": "TOCK", # Add + "DNE": None, # Delete already missing + } + ): + assert os.environ["FOO"] == "BAZ" + assert "SPAM" not in os.environ + assert os.environ["TICK"] == "TOCK" + assert "DNE" not in os.environ + + assert os.environ["FOO"] == "BAR" + assert os.environ["SPAM"] == "EGGS" + assert "TICK" not in os.environ + assert "DNE" not in os.environ diff --git a/tests/test_cobalt_parser.py b/tests/test_cobalt_parser.py deleted file mode 100644 index e91c95100..000000000 --- 
a/tests/test_cobalt_parser.py +++ /dev/null @@ -1,54 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import pytest - -from smartsim._core.launcher.cobalt import cobaltParser - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - - -def test_parse_step_id(): - output = "JobName JobId \n" "=====================\n" "smartsim 507975 \n" - step_id = cobaltParser.parse_cobalt_step_id(output, "smartsim") - assert step_id == "507975" - - -def test_parse_step_status(): - output = "JobName State \n" "=====================\n" "smartsim running \n" - step_id = cobaltParser.parse_cobalt_step_status(output, "smartsim") - assert step_id == "running" - - -def test_parse_qsub_out(): - output = ( - "Job routed to queue 'debug-flat-quad'.\n" - "Memory mode set to flat quad for queue debug-flat-quad\n" - "507998\n" - ) - step_id = cobaltParser.parse_qsub_out(output) - assert step_id == "507998" diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index e68801762..138ceb4b7 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_colo_model_lsf.py b/tests/test_colo_model_lsf.py index e77eeedec..5e1c449cc 100644 --- a/tests/test_colo_model_lsf.py +++ b/tests/test_colo_model_lsf.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_config.py b/tests/test_config.py index bbbb54526..0716ac0d5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -205,7 +205,7 @@ def test_redis_cli(): pytest.param("0", False, id="letter zero"), pytest.param("1", True, id="letter one"), pytest.param("-1", False, id="letter negative one"), - pytest.param(None, False, id="not in env"), + pytest.param(None, True, id="not in env"), ], ) def test_telemetry_flag( diff --git a/tests/test_configs/bad.py b/tests/test_configs/bad.py index 93e4864ff..4efe8b9a1 100644 --- a/tests/test_configs/bad.py +++ b/tests/test_configs/bad.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/check_dirs.py b/tests/test_configs/check_dirs.py index 07b358d49..b817bde9a 100644 --- a/tests/test_configs/check_dirs.py +++ b/tests/test_configs/check_dirs.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/cov/local_cov.cfg b/tests/test_configs/cov/local_cov.cfg index d25e9f83a..481cc08c1 100644 --- a/tests/test_configs/cov/local_cov.cfg +++ b/tests/test_configs/cov/local_cov.cfg @@ -2,7 +2,6 @@ omit = *pbs* *slurm* - *cobalt* *mpirun* *alps* *lsf* @@ -39,7 +38,6 @@ exclude_lines= pragma: no cover cov-pbs cov-slurm - cov-cobalt cov-alps cov-mpirun cov-wlm @@ -49,6 +47,5 @@ exclude_lines= launcher == "slurm" launcher == "pbs" - launcher == "cobalt" launcher == "lsf" launcher == "pals" diff --git a/tests/test_configs/cov/lsf_cov.cfg b/tests/test_configs/cov/lsf_cov.cfg index 6e5f52eb4..03b27c5ec 100644 --- a/tests/test_configs/cov/lsf_cov.cfg +++ b/tests/test_configs/cov/lsf_cov.cfg @@ -2,7 +2,6 @@ omit = *slurm* *local* - *cobalt* *pbs* *alps* *redis_starter.py* @@ -36,11 +35,9 @@ exclude_lines= cov-slurm cov-local - cov-cobalt cov-alps cov-pbs pass launcher == "local" launcher == "slurm" - launcher == "cobalt" launcher == "pbs" diff --git a/tests/test_configs/cov/pbs_cov.cfg b/tests/test_configs/cov/pbs_cov.cfg index 99e7bcfd6..f9274cbf6 100644 --- a/tests/test_configs/cov/pbs_cov.cfg +++ b/tests/test_configs/cov/pbs_cov.cfg @@ -2,7 +2,6 @@ omit = *slurm* *local* - *cobalt* *mpirun* *alps* *lsf* @@ -37,11 +36,9 @@ exclude_lines= cov-slurm cov-local - cov-cobalt cov-alps cov-lsf pass launcher == "local" launcher == "slurm" - launcher == "cobalt" launcher == "lsf" diff --git a/tests/test_configs/cov/slurm_cov.cfg b/tests/test_configs/cov/slurm_cov.cfg index 59405bc35..5aa77cfbe 100644 --- a/tests/test_configs/cov/slurm_cov.cfg +++ b/tests/test_configs/cov/slurm_cov.cfg @@ -2,7 +2,6 @@ omit = *pbs* *local* - *cobalt* *mpirun* *alps* *lsf* @@ -37,11 +36,9 @@ exclude_lines= cov-pbs cov-local - cov-cobalt cov-alps cov-lsf pass launcher == "local" launcher == "pbs" - launcher == "cobalt" launcher == "lsf" diff --git a/tests/test_configs/echo.py b/tests/test_configs/echo.py 
index 6523f4e4f..5d9a57ebb 100644 --- a/tests/test_configs/echo.py +++ b/tests/test_configs/echo.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/generator_files/circular_config/sub_dir/hello.sh b/tests/test_configs/generator_files/circular_config/sub_dir/hello.sh index 705c1dcc6..6d9be1214 100644 --- a/tests/test_configs/generator_files/circular_config/sub_dir/hello.sh +++ b/tests/test_configs/generator_files/circular_config/sub_dir/hello.sh @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/generator_files/multi_tags_template.sh b/tests/test_configs/generator_files/multi_tags_template.sh index 4fd79d321..a7131e892 100644 --- a/tests/test_configs/generator_files/multi_tags_template.sh +++ b/tests/test_configs/generator_files/multi_tags_template.sh @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/generator_files/test_dir/test.in b/tests/test_configs/generator_files/test_dir/test.in index 8a0a76ee2..f91f0256c 100644 --- a/tests/test_configs/generator_files/test_dir/test.in +++ b/tests/test_configs/generator_files/test_dir/test.in @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/hw_mpi.c b/tests/test_configs/hw_mpi.c index 4cfa04631..995047bb0 100644 --- a/tests/test_configs/hw_mpi.c +++ b/tests/test_configs/hw_mpi.c @@ -1,7 +1,7 @@ /* * BSD 2-Clause License * - * Copyright (c) 2021-2023, Hewlett Packard Enterprise + * Copyright (c) 2021-2024, Hewlett Packard Enterprise * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/incoming_entities_reader.py b/tests/test_configs/incoming_entities_reader.py index c558271e3..32d670ed1 100644 --- a/tests/test_configs/incoming_entities_reader.py +++ b/tests/test_configs/incoming_entities_reader.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/ml/training_service_torch.py b/tests/test_configs/ml/training_service_torch.py index 575940031..2a6bac051 100644 --- a/tests/test_configs/ml/training_service_torch.py +++ b/tests/test_configs/ml/training_service_torch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/openmpi4/mpiexec b/tests/test_configs/mpi_impl_stubs/openmpi4/mpiexec index a5e1157d3..b8c34bb63 100755 --- a/tests/test_configs/mpi_impl_stubs/openmpi4/mpiexec +++ b/tests/test_configs/mpi_impl_stubs/openmpi4/mpiexec @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/openmpi4/mpirun b/tests/test_configs/mpi_impl_stubs/openmpi4/mpirun index 9cfffde3d..fa1d3d0ac 100755 --- a/tests/test_configs/mpi_impl_stubs/openmpi4/mpirun +++ b/tests/test_configs/mpi_impl_stubs/openmpi4/mpirun @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/openmpi4/orterun b/tests/test_configs/mpi_impl_stubs/openmpi4/orterun index 66de76417..31f7dd9aa 100755 --- a/tests/test_configs/mpi_impl_stubs/openmpi4/orterun +++ b/tests/test_configs/mpi_impl_stubs/openmpi4/orterun @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/pals/mpiexec b/tests/test_configs/mpi_impl_stubs/pals/mpiexec index 70b27d177..393ee2e3a 100755 --- a/tests/test_configs/mpi_impl_stubs/pals/mpiexec +++ b/tests/test_configs/mpi_impl_stubs/pals/mpiexec @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/slurm/mpiexec b/tests/test_configs/mpi_impl_stubs/slurm/mpiexec index 46fbacf58..07ff9881d 100755 --- a/tests/test_configs/mpi_impl_stubs/slurm/mpiexec +++ b/tests/test_configs/mpi_impl_stubs/slurm/mpiexec @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/reconnect_node.py b/tests/test_configs/reconnect_node.py index 3ff3d71ef..1897529d6 100644 --- a/tests/test_configs/reconnect_node.py +++ b/tests/test_configs/reconnect_node.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/reconnect_sim.py b/tests/test_configs/reconnect_sim.py index eda634517..a8952406d 100644 --- a/tests/test_configs/reconnect_sim.py +++ b/tests/test_configs/reconnect_sim.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py index b486b3fcd..2fdab961b 100644 --- a/tests/test_configs/run_dbscript_smartredis.py +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/run_pt_dbmodel_smartredis.py b/tests/test_configs/run_pt_dbmodel_smartredis.py index 600ae2cb3..dd869c65a 100644 --- a/tests/test_configs/run_pt_dbmodel_smartredis.py +++ b/tests/test_configs/run_pt_dbmodel_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/run_tf_dbmodel_smartredis.py b/tests/test_configs/run_tf_dbmodel_smartredis.py index 874b40c9c..a76009401 100644 --- a/tests/test_configs/run_tf_dbmodel_smartredis.py +++ b/tests/test_configs/run_tf_dbmodel_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/send_data.py b/tests/test_configs/send_data.py index dd9194031..f9b9440c4 100644 --- a/tests/test_configs/send_data.py +++ b/tests/test_configs/send_data.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/send_data_local_smartredis.py b/tests/test_configs/send_data_local_smartredis.py index 0c318736f..34191bcca 100644 --- a/tests/test_configs/send_data_local_smartredis.py +++ b/tests/test_configs/send_data_local_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/sleep.py b/tests/test_configs/sleep.py index 778b8946e..d74d43bf1 100644 --- a/tests/test_configs/sleep.py +++ b/tests/test_configs/sleep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/torchscript.py b/tests/test_configs/torchscript.py index 1ec0e71d5..a90f165aa 100644 --- a/tests/test_configs/torchscript.py +++ b/tests/test_configs/torchscript.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_containers.py b/tests/test_containers.py index e35b4f309..21fe50ad4 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_controller.py b/tests/test_controller.py index 65687ec59..149872708 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index a40ccdf66..a02c17678 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index ec0ed23ea..227572ac9 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py index 4545e80bf..0632eee16 100644 --- a/tests/test_ensemble.py +++ b/tests/test_ensemble.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_entitylist.py b/tests/test_entitylist.py index 675e84426..89f56b7ab 100644 --- a/tests/test_entitylist.py +++ b/tests/test_entitylist.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_experiment.py b/tests/test_experiment.py index c0185ab6d..12b2f1579 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,6 +31,7 @@ from smartsim._core.config import CONFIG from smartsim.entity import Model from smartsim.error import SmartSimError +from smartsim.error.errors import SSUnsupportedError from smartsim.settings import RunSettings from smartsim.status import STATUS_NEVER_STARTED @@ -178,3 +179,8 @@ def test_enable_disable_telemtery(monkeypatch): assert CONFIG.telemetry_enabled exp.disable_telemetry() assert not CONFIG.telemetry_enabled + + +def test_error_on_cobalt(): + with pytest.raises(SSUnsupportedError): + exp = Experiment("cobalt_exp", launcher="cobalt") diff --git a/tests/test_generator.py b/tests/test_generator.py index e4618f9cd..fd9a5b836 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 784219f82..025f53d32 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_indirect.py b/tests/test_indirect.py index f8af88266..73f381441 100644 --- a/tests/test_indirect.py +++ b/tests/test_indirect.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,9 +31,9 @@ import psutil import pytest +from smartsim._core.config import CONFIG from smartsim._core.entrypoints.indirect import cleanup, get_parser, get_ts, main from smartsim._core.utils.helpers import encode_cmd -from smartsim._core.utils.serialize import MANIFEST_FILENAME, TELMON_SUBDIR ALL_ARGS = { "+command", @@ -152,7 +152,7 @@ def test_indirect_main_dir_check(test_dir): cmd = ["echo", "unit-test"] encoded_cmd = encode_cmd(cmd) - status_path = exp_dir / TELMON_SUBDIR + status_path = exp_dir / CONFIG.telemetry_subdir # show that a missing status_path is created when missing main(encoded_cmd, "application", exp_dir, status_path) @@ -167,7 +167,7 @@ def test_indirect_main_cmd_check(capsys, test_dir, monkeypatch): captured = capsys.readouterr() # throw away existing output with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: ctx.setattr("smartsim._core.entrypoints.indirect.logger.error", print) - _ = main("", "application", exp_dir, exp_dir / TELMON_SUBDIR) + _ = main("", "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) captured = capsys.readouterr() assert "Invalid cmd supplied" in ex.value.args[0] @@ -175,7 +175,8 @@ def test_indirect_main_cmd_check(capsys, test_dir, monkeypatch): # test with non-emptystring cmd with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: ctx.setattr("smartsim._core.entrypoints.indirect.logger.error", print) - _ = main(" \n \t ", "application", exp_dir, exp_dir / TELMON_SUBDIR) + status_dir = exp_dir / CONFIG.telemetry_subdir + 
_ = main(" \n \t ", "application", exp_dir, status_dir) captured = capsys.readouterr() assert "Invalid cmd supplied" in ex.value.args[0] @@ -190,13 +191,13 @@ def test_complete_process(fileutils, test_dir): raw_cmd = f"{sys.executable} {script} --time=1" cmd = encode_cmd(raw_cmd.split()) - rc = main(cmd, "application", exp_dir, exp_dir / TELMON_SUBDIR) + rc = main(cmd, "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) assert rc == 0 assert exp_dir.exists() # NOTE: don't have a manifest so we're falling back to default event path - data_dir = exp_dir / TELMON_SUBDIR + data_dir = exp_dir / CONFIG.telemetry_subdir start_events = list(data_dir.rglob("start.json")) stop_events = list(data_dir.rglob("stop.json")) diff --git a/tests/test_init.py b/tests/test_init.py index 76f58b59a..dfb58bd55 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_interrupt.py b/tests/test_interrupt.py index 73616a848..28c48e0db 100644 --- a/tests/test_interrupt.py +++ b/tests/test_interrupt.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index 51d8b60a6..0557f3cf4 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index e4d593b6f..7befff95e 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index b6eaba56a..576e290ca 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index b05401138..c59aebd7b 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_logs.py b/tests/test_logs.py new file mode 100644 index 000000000..88c6a738f --- /dev/null +++ b/tests/test_logs.py @@ -0,0 +1,212 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import pathlib + +import pytest + +import smartsim +import smartsim.log +from smartsim import Experiment + +_CFG_TM_ENABLED_ATTR = "telemetry_enabled" + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +@pytest.fixture +def turn_on_tm(monkeypatch): + monkeypatch.setattr( + smartsim._core.config.config.Config, + _CFG_TM_ENABLED_ATTR, + property(lambda self: True), + ) + yield + + +@pytest.mark.parametrize( + "level,expect_d,expect_i,expect_w,expect_e", + [ + pytest.param("DEBUG", True, False, False, False, id="debug-level"), + pytest.param("INFO", True, True, False, False, id="info-level"), + pytest.param("WARNING", True, True, True, False, id="warn-level"), + pytest.param("ERROR", True, True, True, True, id="err-level"), + ], +) +def test_lowpass_filter(level, expect_d, expect_i, expect_w, expect_e): + """Ensure that messages above maximum are not logged""" + log_filter = smartsim.log.LowPassFilter(level) + + 
faux_out_stream = io.StringIO() + handler = logging.StreamHandler(faux_out_stream) + handler.setFormatter(logging.Formatter("%(message)s")) + + logger = logging.getLogger(f"test_level_filter_{level}") + logger.addHandler(handler) + logger.addFilter(log_filter) + + logger.debug(str(logging.DEBUG)) + logger.info(str(logging.INFO)) + logger.warning(str(logging.WARNING)) + logger.exception(str(logging.ERROR)) + + logged_messages = faux_out_stream.getvalue().split("\n") + assert (str(logging.DEBUG) in logged_messages) == expect_d + assert (str(logging.INFO) in logged_messages) == expect_i + assert (str(logging.WARN) in logged_messages) == expect_w + assert (str(logging.ERROR) in logged_messages) == expect_e + + +def test_add_exp_loggers(test_dir): + """Ensure that expected loggers are added""" + # test_dir = fileutils.make_test_dir() + faux_out_stream = io.StringIO() + + logger = logging.getLogger("smartsim_test_add_exp_loggers") + logger.addHandler(logging.StreamHandler(faux_out_stream)) + + out_file = pathlib.Path(test_dir) / "smartsim.out" + err_file = pathlib.Path(test_dir) / "smartsim.err" + + filter_fn = lambda x: True + + smartsim.log.log_to_exp_file(str(out_file), logger, log_filter=filter_fn) + smartsim.log.log_to_exp_file(str(err_file), logger, "WARN") + + logger.debug("debug") + logger.exception("exception") + + assert out_file.exists() + assert out_file.is_file() + + assert err_file.exists() + assert err_file.is_file() + + +def test_get_logger(test_dir: str, turn_on_tm, monkeypatch): + """Ensure the correct logger type is instantiated""" + monkeypatch.setenv("SMARTSIM_LOG_LEVEL", "developer") + logger = smartsim.log.get_logger("SmartSimTest", "INFO") + assert isinstance(logger, smartsim.log.ContextAwareLogger) + + +@pytest.mark.parametrize( + "input_level,exp_level", + [ + pytest.param("INFO", "info", id="lowercasing only, INFO"), + pytest.param("info", "info", id="input back, info"), + pytest.param("WARNING", "warning", id="lowercasing only, WARNING"), + 
pytest.param("warning", "warning", id="input back, warning"), + pytest.param("QUIET", "warning", id="lowercasing only, QUIET"), + pytest.param("quiet", "warning", id="translation back, quiet"), + pytest.param("DEVELOPER", "debug", id="lowercasing only, DEVELOPER"), + pytest.param("developer", "debug", id="translation back, developer"), + ], +) +def test_translate_log_level(input_level: str, exp_level: str, turn_on_tm): + """Ensure the correct logger type is instantiated""" + translated_level = smartsim.log._translate_log_level(input_level) + assert exp_level == translated_level + + +def test_exp_logs(test_dir: str, turn_on_tm, monkeypatch): + """Ensure that experiment loggers are added when context info exists""" + monkeypatch.setenv("SMARTSIM_LOG_LEVEL", "developer") + test_dir = pathlib.Path(test_dir) + test_dir.mkdir(parents=True, exist_ok=True) + + token = smartsim.log.ctx_exp_path.set(test_dir) + + try: + logger = smartsim.log.get_logger("SmartSimTest", "INFO") + + faux_out_stream = io.StringIO() + logger.addHandler(logging.StreamHandler(faux_out_stream)) + + log_msg = "testing in a test!" + err_msg = "erroring in a test!" 
+ logger.info(log_msg) + logger.error(err_msg) + + # ensure that the default stream is written to + logged = faux_out_stream.getvalue() + + assert log_msg in logged + assert err_msg in logged + + out_file, err_file = smartsim.log.get_exp_log_paths() + + out_content = out_file.read_text() + err_content = err_file.read_text() + + # ensure the low-pass filter logs non-errors to out file + assert log_msg in out_content + assert err_msg not in out_content + assert str(test_dir) in out_content + + # ensure the errors are logged to err file + assert err_msg in err_content + assert log_msg not in err_content + assert str(err_msg) in err_content + finally: + smartsim.log.ctx_exp_path.reset(token) + + +def test_context_leak(test_dir: str, turn_on_tm, monkeypatch): + """Ensure that exceptions do not leave the context in an invalid state""" + test_dir = pathlib.Path(test_dir) + test_dir.mkdir(parents=True, exist_ok=True) + + original_ctx_value = test_dir / pathlib.Path("some value") + ctx_var = smartsim.log.ctx_exp_path + token = ctx_var.set(original_ctx_value) + + err_msg = "some ex occurred in JobManager" + + def thrower(_self): + raise Exception(err_msg) + + try: + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.control.jobmanager.JobManager, "start", thrower) + exp = Experiment("MyExperiment", launcher="local", exp_path=str(test_dir)) + + sleep_rs = exp.create_run_settings("sleep", ["2"]) + sleep_rs.set_nodes(1) + sleep_rs.set_tasks(1) + + sleep = exp.create_model("SleepModel", sleep_rs) + exp.generate(sleep) + exp.start(sleep, block=True) + except Exception as ex: + assert err_msg in ex.args + finally: + assert ctx_var.get() == original_ctx_value + ctx_var.reset(token) + assert ctx_var.get() == "" diff --git a/tests/test_lsf_parser.py b/tests/test_lsf_parser.py index f41de54d8..abd27eb5a 100644 --- a/tests/test_lsf_parser.py +++ b/tests/test_lsf_parser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# 
Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_lsf_settings.py b/tests/test_lsf_settings.py index a71d658cb..fcb351648 100644 --- a/tests/test_lsf_settings.py +++ b/tests/test_lsf_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_manifest.py b/tests/test_manifest.py index ea9920fad..33fc6b163 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,6 +40,7 @@ _LaunchedManifestMetadata as LaunchedManifestMetadata, ) from smartsim.database import Orchestrator +from smartsim.entity.dbobject import DBModel, DBScript from smartsim.error import SmartSimError from smartsim.settings import RunSettings @@ -61,6 +62,9 @@ orc_1.name = "orc2" model_no_name = exp.create_model(name=None, run_settings=rs) +db_script = DBScript("some-script", "def main():\n print('hello world')\n") +db_model = DBModel("some-model", "TORCH", b"some-model-bytes") + def test_separate(): manifest = Manifest(model, ensemble, orc) @@ -106,6 +110,38 @@ class Person: _ = Manifest(p) +@pytest.mark.parametrize( + "patch, has_db_objects", + [ + pytest.param((), False, id="No DB Objects"), + pytest.param((model, "_db_models", [db_model]), True, id="Model w/ DB Model"), + pytest.param( + (model, "_db_scripts", [db_script]), True, id="Model w/ DB Script" + ), + pytest.param( + (ensemble, "_db_models", [db_model]), True, id="Ensemble w/ DB Model" + ), + pytest.param( + (ensemble, "_db_scripts", [db_script]), 
True, id="Ensemble w/ DB Script" + ), + pytest.param( + (ensemble.entities[0], "_db_models", [db_model]), + True, + id="Ensemble Member w/ DB Model", + ), + pytest.param( + (ensemble.entities[0], "_db_scripts", [db_script]), + True, + id="Ensemble Member w/ DB Script", + ), + ], +) +def test_manifest_detects_db_objects(monkeypatch, patch, has_db_objects): + if patch: + monkeypatch.setattr(*patch) + assert Manifest(model, ensemble).has_db_objects == has_db_objects + + def test_launched_manifest_transform_data(): models = [(model, 1), (model_2, 2)] ensembles = [(ensemble, [(m, i) for i, m in enumerate(ensemble.entities)])] diff --git a/tests/test_model.py b/tests/test_model.py index 88700ad23..a1b5ba505 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_modelwriter.py b/tests/test_modelwriter.py index 4554a8b5a..a857d7c5f 100644 --- a/tests/test_modelwriter.py +++ b/tests/test_modelwriter.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_mpi_settings.py b/tests/test_mpi_settings.py index 4e0bc48c8..7d8db6e75 100644 --- a/tests/test_mpi_settings.py +++ b/tests/test_mpi_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_multidb.py b/tests/test_multidb.py index c4336294e..af21f5a1e 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_orc_config_settings.py b/tests/test_orc_config_settings.py index f08467be0..365596496 100644 --- a/tests/test_orc_config_settings.py +++ b/tests/test_orc_config_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 4a1b08367..f87aa9331 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -256,54 +256,6 @@ def test_orc_results_in_correct_number_of_shards(single_cmd): ) -###### Cobalt ###### - - -def test_cobalt_set_run_arg(wlmutils): - orc = Orchestrator( - wlmutils.get_test_port(), - db_nodes=3, - batch=False, - interface="lo", - launcher="cobalt", - run_command="aprun", - ) - orc.set_run_arg("account", "ACCOUNT") - assert all( - [db.run_settings.run_args["account"] == "ACCOUNT" for db in orc.entities] - ) - orc.set_run_arg("pes-per-numa-node", "2") - assert all( - ["pes-per-numa-node" not in db.run_settings.run_args for db in orc.entities] - ) - - -def test_cobalt_set_batch_arg(wlmutils): - orc = Orchestrator( - wlmutils.get_test_port(), - db_nodes=3, - batch=False, - interface="lo", - launcher="cobalt", - run_command="aprun", - ) - with pytest.raises(SmartSimError): - orc.set_batch_arg("account", "ACCOUNT") - - orc2 = Orchestrator( - wlmutils.get_test_port(), - db_nodes=3, - batch=True, - interface="lo", - launcher="cobalt", - run_command="aprun", - ) - orc2.set_batch_arg("account", "ACCOUNT") - assert orc2.batch_settings.batch_args["account"] == "ACCOUNT" - orc2.set_batch_arg("outputprefix", "new_output/") - assert "outputprefix" not in orc2.batch_settings.batch_args - - ###### LSF ###### diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index 2cd725f65..8bc23d14d 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_pbs_parser.py b/tests/test_pbs_parser.py index 554780cd7..f77eb7c93 100644 --- a/tests/test_pbs_parser.py +++ b/tests/test_pbs_parser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_pbs_settings.py b/tests/test_pbs_settings.py index ed450bd82..cefe3de4e 100644 --- a/tests/test_pbs_settings.py +++ b/tests/test_pbs_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index 0faa92242..554e42cbd 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_run_settings.py b/tests/test_run_settings.py index 7bcd6d874..b9439f41a 100644 --- a/tests/test_run_settings.py +++ b/tests/test_run_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -97,7 +97,7 @@ def test_create_run_settings_local(): id=f"{l}/orterun", ), ) - for l in ("local", "pbs", "slurm", "lsf", "cobalt") + for l in ("local", "pbs", "slurm", "lsf") ) ), # Except for launchers that implement their own MPI settings diff --git a/tests/test_serialize.py b/tests/test_serialize.py index 167e7e445..9e92a4866 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ from smartsim._core.utils import serialize from smartsim.database.orchestrator import Orchestrator -_REL_MANIFEST_PATH = f"{serialize.TELMON_SUBDIR}/{serialize.MANIFEST_FILENAME}" _CFG_TM_ENABLED_ATTR = "telemetry_enabled" # The tests in this file belong to the group_b group @@ -54,10 +53,14 @@ def turn_on_tm(monkeypatch): yield -def test_serialize_creates_a_manifest_json_file_if_dne(test_dir): +@pytest.fixture +def manifest_json(test_dir, config) -> str: + return Path(test_dir) / config.telemetry_subdir / serialize.MANIFEST_FILENAME + + +def test_serialize_creates_a_manifest_json_file_if_dne(test_dir, manifest_json): lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") serialize.save_launch_manifest(lmb.finalize()) - manifest_json = Path(test_dir) / _REL_MANIFEST_PATH assert manifest_json.is_file() with open(manifest_json, "r") as f: @@ -69,7 +72,7 @@ def test_serialize_creates_a_manifest_json_file_if_dne(test_dir): def test_serialize_does_not_write_manifest_json_if_telemetry_monitor_is_off( - test_dir, monkeypatch + test_dir, monkeypatch, manifest_json ): monkeypatch.setattr( smartsim._core.config.config.Config, @@ -78,12 +81,10 @@ def test_serialize_does_not_write_manifest_json_if_telemetry_monitor_is_off( ) lmb = 
LaunchedManifestBuilder("exp", test_dir, "launcher") serialize.save_launch_manifest(lmb.finalize()) - manifest_json = Path(test_dir) / _REL_MANIFEST_PATH assert not manifest_json.exists() -def test_serialize_appends_a_manifest_json_exists(test_dir): - manifest_json = Path(test_dir) / _REL_MANIFEST_PATH +def test_serialize_appends_a_manifest_json_exists(test_dir, manifest_json): serialize.save_launch_manifest( LaunchedManifestBuilder("exp", test_dir, "launcher").finalize() ) @@ -102,8 +103,7 @@ def test_serialize_appends_a_manifest_json_exists(test_dir): assert len({run["run_id"] for run in manifest["runs"]}) == 3 -def test_serialize_overwites_file_if_not_json(test_dir): - manifest_json = Path(test_dir) / _REL_MANIFEST_PATH +def test_serialize_overwites_file_if_not_json(test_dir, manifest_json): manifest_json.parent.mkdir(parents=True, exist_ok=True) with open(manifest_json, "w") as f: f.write("This is not a json\n") @@ -114,10 +114,8 @@ def test_serialize_overwites_file_if_not_json(test_dir): assert isinstance(json.load(f), dict) -def test_started_entities_are_serialized(test_dir): +def test_started_entities_are_serialized(test_dir, manifest_json): exp_name = "test-exp" - test_dir = Path(test_dir) / exp_name - test_dir.mkdir(parents=True) exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local") rs1 = exp.create_run_settings("echo", ["hello", "world"]) @@ -131,7 +129,6 @@ def test_started_entities_are_serialized(test_dir): exp.start(hello_world_model, spam_eggs_model, block=False) exp.start(hello_ensemble, block=False) - manifest_json = Path(exp.exp_path) / _REL_MANIFEST_PATH try: with open(manifest_json, "r") as f: manifest = json.load(f) diff --git a/tests/test_shell_util.py b/tests/test_shell_util.py index 7b7ac55b7..24f6b023c 100644 --- a/tests/test_shell_util.py +++ b/tests/test_shell_util.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All 
rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_slurm_get_alloc.py b/tests/test_slurm_get_alloc.py index 270bbf014..aa12ce362 100644 --- a/tests/test_slurm_get_alloc.py +++ b/tests/test_slurm_get_alloc.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -33,7 +33,7 @@ def test_get_alloc_format(): - time = "10:00:00" + time = "10:00:70" nodes = 5 account = "A35311" options = {"ntasks-per-node": 5} @@ -45,7 +45,7 @@ def test_get_alloc_format(): "-J", "SmartSim", "-t", - "10:00:00", + "10:01:10", "-A", "A35311", "--ntasks-per-node=5", diff --git a/tests/test_slurm_parser.py b/tests/test_slurm_parser.py index 30c6c5b31..b5f7cf32a 100644 --- a/tests/test_slurm_parser.py +++ b/tests/test_slurm_parser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_slurm_settings.py b/tests/test_slurm_settings.py index d6bfd5063..aa5b2be11 100644 --- a/tests/test_slurm_settings.py +++ b/tests/test_slurm_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_slurm_validation.py b/tests/test_slurm_validation.py index c3f796ba6..02baddce6 100644 --- a/tests/test_slurm_validation.py +++ b/tests/test_slurm_validation.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py index 2f234c217..282e708cc 100644 --- a/tests/test_smartredis.py +++ b/tests/test_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_step_info.py b/tests/test_step_info.py index eee920192..ec589ae76 100644 --- a/tests/test_step_info.py +++ b/tests/test_step_info.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index 3f804b077..ac3599d7d 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -93,12 +93,13 @@ def turn_on_tm(monkeypatch): yield -def snooze_nonblocking(test_dir: str, max_delay: int = 20, post_data_delay: int = 2): - telmon_subdir = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR +def snooze_nonblocking( + test_dir: pathlib.Path, max_delay: int = 20, post_data_delay: int = 2 +): # let the non-blocking experiment complete. 
for _ in range(max_delay): time.sleep(1) - if telmon_subdir.exists(): + if test_dir.exists(): time.sleep(post_data_delay) break @@ -179,7 +180,7 @@ def test_track_event( assert expected_output.is_file() -def test_load_manifest(fileutils: FileUtils, test_dir: str): +def test_load_manifest(fileutils: FileUtils, test_dir: str, config: cfg.Config): """Ensure that the runtime manifest loads correctly""" sample_manifest_path = fileutils.get_test_conf_path("telemetry/telemetry.json") sample_manifest = pathlib.Path(sample_manifest_path) @@ -187,7 +188,7 @@ def test_load_manifest(fileutils: FileUtils, test_dir: str): test_manifest_path = fileutils.make_test_file( serialize.MANIFEST_FILENAME, - pathlib.Path(test_dir) / serialize.TELMON_SUBDIR, + pathlib.Path(test_dir) / config.telemetry_subdir, sample_manifest.read_text(), ) test_manifest = pathlib.Path(test_manifest_path) @@ -431,7 +432,7 @@ def is_alive(self) -> bool: assert observer.stop_count == 1 -def test_telemetry_single_model(fileutils, test_dir, wlmutils): +def test_telemetry_single_model(fileutils, test_dir, wlmutils, config): """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp with unique db_identifiers""" @@ -446,7 +447,7 @@ def test_telemetry_single_model(fileutils, test_dir, wlmutils): exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -456,7 +457,7 @@ def test_telemetry_single_model(fileutils, test_dir, wlmutils): exp.start(smartsim_model, block=True) assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir start_events = list(telemetry_output_path.rglob("start.json")) stop_events = 
list(telemetry_output_path.rglob("stop.json")) @@ -464,7 +465,9 @@ def test_telemetry_single_model(fileutils, test_dir, wlmutils): assert len(stop_events) == 1 -def test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monkeypatch): +def test_telemetry_single_model_nonblocking( + fileutils, test_dir, wlmutils, monkeypatch, config +): """Ensure that the telemetry monitor logs exist when the experiment is non-blocking""" with monkeypatch.context() as ctx: @@ -481,7 +484,7 @@ def test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monke exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -490,11 +493,11 @@ def test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monke exp.generate(smartsim_model) exp.start(smartsim_model) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -502,7 +505,7 @@ def test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monke assert len(stop_events) == 1 -def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch): +def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch, config): """ Test telemetry with models being run in serial (one after each other) """ @@ -520,7 +523,7 @@ def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch): exp = Experiment(exp_name, 
launcher=test_launcher, exp_path=test_dir) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -534,7 +537,7 @@ def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch): [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] ) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -543,7 +546,7 @@ def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch): def test_telemetry_serial_models_nonblocking( - fileutils, test_dir, wlmutils, monkeypatch + fileutils, test_dir, wlmutils, monkeypatch, config ): """ Test telemetry with models being run in serial (one after each other) @@ -563,7 +566,7 @@ def test_telemetry_serial_models_nonblocking( exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -574,13 +577,13 @@ def test_telemetry_serial_models_nonblocking( exp.generate(*smartsim_models) exp.start(*smartsim_models) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) assert all( [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] ) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR start_events = list(telemetry_output_path.rglob("start.json")) stop_events = 
list(telemetry_output_path.rglob("stop.json")) @@ -588,7 +591,7 @@ def test_telemetry_serial_models_nonblocking( assert len(stop_events) == 5 -def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): +def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config): """ Test telemetry with only a database running """ @@ -609,12 +612,14 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): # create regular database orc = exp.create_database(port=test_port, interface=test_interface) exp.generate(orc) + + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + try: exp.start(orc, block=True) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -622,7 +627,7 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): assert len(stop_events) <= 1 finally: exp.stop(orc) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) assert exp.get_status(orc)[0] == STATUS_CANCELLED @@ -630,7 +635,7 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): assert len(stop_events) == 1 -def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch): +def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, config): """ Test telemetry with only a non-generated database running """ @@ -651,13 +656,13 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch): # create regular database orc = exp.create_database(port=test_port, interface=test_interface) orc.set_path(test_dir) + telemetry_output_path = pathlib.Path(test_dir) / 
config.telemetry_subdir try: exp.start(orc) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -666,14 +671,14 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch): finally: exp.stop(orc) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) assert exp.get_status(orc)[0] == STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 -def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): +def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch, config): """ Test telemetry with only a database and a model running """ @@ -700,7 +705,7 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): exp.start(orc) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -711,13 +716,12 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): finally: exp.stop(orc) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) assert exp.get_status(orc)[0] == STATUS_CANCELLED assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR - start_events = list(telemetry_output_path.rglob("database/**/start.json")) stop_events = 
list(telemetry_output_path.rglob("database/**/stop.json")) @@ -730,7 +734,7 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): assert len(stop_events) == 1 -def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): +def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch, config): """ Test telemetry with only an ensemble """ @@ -748,7 +752,7 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -757,8 +761,8 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): exp.start(ens, block=True) assert all([status == STATUS_COMPLETED for status in exp.get_status(ens)]) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -766,7 +770,7 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): assert len(stop_events) == 5 -def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch): +def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch, config): """ Test telemetry with only a colocated model running """ @@ -797,7 +801,7 @@ def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch): [status == STATUS_COMPLETED for status in exp.get_status(smartsim_model)] ) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + 
telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -814,7 +818,9 @@ def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch): pytest.param(1, 15, id="15s shutdown"), ], ) -def test_telemetry_autoshutdown(test_dir, wlmutils, monkeypatch, frequency, cooldown): +def test_telemetry_autoshutdown( + test_dir, wlmutils, monkeypatch, frequency, cooldown, config +): """ Ensure that the telemetry monitor process shuts down after the desired cooldown period @@ -837,7 +843,7 @@ def test_telemetry_autoshutdown(test_dir, wlmutils, monkeypatch, frequency, cool stop_time = start_time exp.start(block=False) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir empty_mani = list(telemetry_output_path.rglob("manifest.json")) assert len(empty_mani) == 1, "an manifest.json should be created" @@ -867,8 +873,8 @@ def get_launch_cmd(self): @pytest.fixture -def mock_step_meta_dict(test_dir): - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR +def mock_step_meta_dict(test_dir, config): + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir yield { "entity_type": "mock", "status_dir": telemetry_output_path, @@ -958,6 +964,7 @@ def test_multistart_experiment( test_dir: str, monkeypatch: pytest.MonkeyPatch, run_command: str, + config: cfg.Config, ): """Run an experiment with multiple start calls to ensure that telemetry is saved correctly for each run @@ -1016,7 +1023,7 @@ def test_multistart_experiment( assert tm_pid == exp._control._telemetry_monitor.pid time.sleep(3) # time for telmon to write db stop event - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir db_start_events = 
list(telemetry_output_path.rglob("database/**/start.json")) db_stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) diff --git a/tutorials/getting_started/getting_started.ipynb b/tutorials/getting_started/getting_started.ipynb index a80443564..0a5230b0f 100644 --- a/tutorials/getting_started/getting_started.ipynb +++ b/tutorials/getting_started/getting_started.ipynb @@ -36,7 +36,6 @@ "The `Experiment` also needs to have a `launcher` specified. Launchers provide SmartSim the ability to construct and execute complex workloads on HPC systems with schedulers (workload managers) like Slurm, or PBS. SmartSim currently supports\n", " * `slurm`\n", " * `pbs`\n", - " * `cobalt`\n", " * `lsf`\n", " * `local` (single node/laptops)\n", " * `auto`\n", @@ -809,7 +808,7 @@ "module = torch.jit.trace(net, example_forward_input)\n", "\n", "# Save the traced model to a file\n", - "torch.jit.save(module, \"./torch_cnn.pt\") " + "torch.jit.save(module, \"./torch_cnn.pt\")" ] }, { @@ -982,7 +981,7 @@ "source": [ "rs_prod = exp.create_run_settings(\"python\", f\"producer.py --redis-port {REDIS_PORT}\")\n", "ensemble = exp.create_ensemble(name=\"producer\",\n", - " replicas=2, \n", + " replicas=2,\n", " run_settings=rs_prod)" ] }, diff --git a/tutorials/ml_inference/Inference-in-SmartSim.ipynb b/tutorials/ml_inference/Inference-in-SmartSim.ipynb index 384c46d69..711ae999c 100644 --- a/tutorials/ml_inference/Inference-in-SmartSim.ipynb +++ b/tutorials/ml_inference/Inference-in-SmartSim.ipynb @@ -74,7 +74,7 @@ "\n", "Build SmartSim dependencies (Redis, RedisAI, ML runtimes)\n", "\n", - "optional arguments:\n", + "options:\n", " -h, --help show this help message and exit\n", " -v Enable verbose build process\n", " --device {cpu,gpu} Device to build ML runtimes for\n", @@ -129,16 +129,16 @@ "\n", "ML Backends Requested\n", "╒════════════╤════════╤══════╕\n", - "│ PyTorch │ 1.11.0 │ \u001b[32mTrue\u001b[0m │\n", - "│ TensorFlow │ 2.8.0 │ \u001b[32mTrue\u001b[0m │\n", - "│ 
ONNX │ 1.11.0 │ \u001b[32mTrue\u001b[0m │\n", + "│ PyTorch │ 2.0.1 │ \u001b[32mTrue\u001b[0m │\n", + "│ TensorFlow │ 2.13.1 │ \u001b[32mTrue\u001b[0m │\n", + "│ ONNX │ 1.14.1 │ \u001b[32mTrue\u001b[0m │\n", "╘════════════╧════════╧══════╛\n", "\n", "Building for GPU support: \u001b[31mFalse\u001b[0m\n", "\n", "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m Building RedisAI version 1.2.7 from https://github.com/RedisAI/RedisAI.git/\n", "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m ML Backends and RedisAI build complete!\n", - "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m Tensorflow, Torch, Onnxruntime backend(s) built\n", + "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m Tensorflow, Onnxruntime, Torch backend(s) built\n", "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m SmartSim build complete!\n" ] } @@ -351,48 +351,46 @@ "name": "stdout", "output_type": "stream", "text": [ - "SmartRedis Library@23-56-41:WARNING: Environment variable SR_LOG_FILE is not set. Defaulting to stdout\n", - "SmartRedis Library@23-56-41:WARNING: Environment variable SR_LOG_LEVEL is not set. 
Defaulting to INFO\n", - "Prediction: [[-2.3274555 -2.3253717 -2.354757 -2.3729622 -2.3431003 -2.1907542\n", - " -2.3514638 -2.1824958 -2.3210742 -2.2772176]\n", - " [-2.319342 -2.3146112 -2.370425 -2.372699 -2.3437245 -2.1988375\n", - " -2.354674 -2.1797025 -2.3205185 -2.2724082]\n", - " [-2.316474 -2.3222082 -2.354598 -2.3659394 -2.3442194 -2.203955\n", - " -2.3561926 -2.1938426 -2.3158035 -2.2702417]\n", - " [-2.3319743 -2.311106 -2.356003 -2.3770962 -2.333499 -2.1953351\n", - " -2.3548756 -2.195049 -2.310809 -2.2787712]\n", - " [-2.3205962 -2.3178282 -2.3519592 -2.3816493 -2.3516834 -2.1981795\n", - " -2.3636622 -2.1777525 -2.3139138 -2.2705152]\n", - " [-2.3096914 -2.3222034 -2.3647196 -2.3790689 -2.3540542 -2.206103\n", - " -2.350227 -2.1878397 -2.3078933 -2.2638521]\n", - " [-2.3328648 -2.3219166 -2.3527567 -2.3824098 -2.3419397 -2.1949291\n", - " -2.3534136 -2.1831408 -2.31838 -2.2653728]\n", - " [-2.3125417 -2.324307 -2.3541815 -2.379772 -2.348383 -2.2018006\n", - " -2.3614779 -2.1773078 -2.322288 -2.2653532]\n", - " [-2.3261974 -2.3169107 -2.3658333 -2.372918 -2.3417373 -2.1894612\n", - " -2.3535395 -2.2018242 -2.308719 -2.268019 ]\n", - " [-2.316616 -2.3056076 -2.355318 -2.3717446 -2.346278 -2.1928883\n", - " -2.3632033 -2.2028553 -2.3090112 -2.2805274]\n", - " [-2.3209507 -2.3127859 -2.358682 -2.3774037 -2.3558414 -2.2000623\n", - " -2.3439143 -2.1920927 -2.3196788 -2.2638488]\n", - " [-2.3159695 -2.3109243 -2.356306 -2.374135 -2.3412004 -2.1999855\n", - " -2.3728766 -2.1851294 -2.3103416 -2.2791054]\n", - " [-2.320004 -2.3205712 -2.3569424 -2.3752837 -2.3463457 -2.1887283\n", - " -2.3645942 -2.1946917 -2.3067377 -2.272361 ]\n", - " [-2.310819 -2.3274822 -2.356091 -2.3715394 -2.3474889 -2.200722\n", - " -2.3434677 -2.1957805 -2.3201551 -2.2701602]\n", - " [-2.3143158 -2.31956 -2.358585 -2.362682 -2.3464782 -2.196579\n", - " -2.3578608 -2.2015376 -2.3066673 -2.2789493]\n", - " [-2.318907 -2.3225117 -2.3634868 -2.3806338 -2.344084 -2.1920872\n", - " 
-2.3534818 -2.1955805 -2.3039575 -2.2711294]\n", - " [-2.3084583 -2.3254113 -2.3642344 -2.3710778 -2.3496058 -2.192245\n", - " -2.3604536 -2.1796546 -2.310007 -2.286219 ]\n", - " [-2.3140576 -2.3124697 -2.359347 -2.379842 -2.3481016 -2.1948602\n", - " -2.3681424 -2.1851056 -2.3161757 -2.2693238]\n", - " [-2.3162746 -2.3137376 -2.3598473 -2.3751001 -2.3536685 -2.1899457\n", - " -2.3560162 -2.1918488 -2.3077402 -2.2818694]\n", - " [-2.3138344 -2.3119657 -2.3552136 -2.3767023 -2.3556495 -2.187487\n", - " -2.3484402 -2.1922355 -2.3236399 -2.2809098]]\n" + "Prediction: [[-2.1860428 -2.3318565 -2.2773128 -2.2742267 -2.2679536 -2.304159\n", + " -2.423439 -2.3406057 -2.2474668 -2.3950338]\n", + " [-2.1803837 -2.3286302 -2.2805855 -2.2874444 -2.261593 -2.3145547\n", + " -2.4357762 -2.3169715 -2.2618299 -2.3798223]\n", + " [-2.1833746 -2.3249795 -2.28497 -2.2851245 -2.2555952 -2.308204\n", + " -2.4274755 -2.3441646 -2.2553194 -2.3779805]\n", + " [-2.1843016 -2.3395848 -2.2619352 -2.294549 -2.2571433 -2.312943\n", + " -2.4161577 -2.338785 -2.2538524 -2.3881512]\n", + " [-2.1936755 -2.3315516 -2.2739122 -2.2832148 -2.2666094 -2.3038912\n", + " -2.4211216 -2.3300066 -2.2564852 -2.3846986]\n", + " [-2.1709712 -2.3271346 -2.280365 -2.286064 -2.2617233 -2.3227994\n", + " -2.4253702 -2.3313646 -2.2593162 -2.383301 ]\n", + " [-2.1948013 -2.3318067 -2.2713811 -2.2844 -2.2526758 -2.3178148\n", + " -2.4255004 -2.3233378 -2.2388031 -2.4088087]\n", + " [-2.17515 -2.3240736 -2.2818787 -2.2857373 -2.259629 -2.3184\n", + " -2.425821 -2.3519678 -2.2413275 -2.385761 ]\n", + " [-2.187554 -2.3335872 -2.2767708 -2.2818003 -2.2654893 -2.3097534\n", + " -2.4182632 -2.3376188 -2.2509694 -2.384327 ]\n", + " [-2.1793714 -2.340681 -2.271785 -2.287751 -2.2620957 -2.3163543\n", + " -2.4111845 -2.3468175 -2.2472064 -2.3842056]\n", + " [-2.1906679 -2.3483853 -2.2580595 -2.2923894 -2.25718 -2.2951608\n", + " -2.431815 -2.3487022 -2.2326546 -2.3963163]\n", + " [-2.1882055 -2.3293467 -2.2767649 -2.279892 
-2.2527165 -2.3220086\n", + " -2.4226239 -2.3364902 -2.2455037 -2.394776 ]\n", + " [-2.1756573 -2.3318045 -2.2690601 -2.2737868 -2.264148 -2.3212118\n", + " -2.4243867 -2.3421402 -2.2562728 -2.390894 ]\n", + " [-2.1824148 -2.3317673 -2.2749603 -2.291667 -2.2524009 -2.3026595\n", + " -2.42986 -2.3290846 -2.265264 -2.387787 ]\n", + " [-2.1871543 -2.3408008 -2.2773213 -2.283908 -2.249834 -2.3159058\n", + " -2.4251873 -2.339211 -2.245001 -2.3839695]\n", + " [-2.1855574 -2.3216138 -2.2722392 -2.2826352 -2.2573392 -2.308948\n", + " -2.4348576 -2.3421624 -2.2397952 -2.4060655]\n", + " [-2.1876159 -2.330091 -2.2779942 -2.2849102 -2.2582757 -2.3122754\n", + " -2.4250498 -2.333003 -2.250753 -2.3871331]\n", + " [-2.182653 -2.3381891 -2.2795184 -2.287199 -2.2628696 -2.303869\n", + " -2.413879 -2.3404965 -2.26254 -2.3739154]\n", + " [-2.1733668 -2.3377435 -2.2724369 -2.28559 -2.2537165 -2.3127556\n", + " -2.4249415 -2.3484716 -2.2515364 -2.3897333]\n", + " [-2.1839535 -2.336417 -2.2839231 -2.285238 -2.2608624 -2.3198016\n", + " -2.424396 -2.3165755 -2.2433887 -2.3935702]]\n" ] } ], @@ -420,8 +418,8 @@ "source": [ "As we gave the CNN random noise, the predictions reflect that.\n", "\n", - "If running on CPU, be sure to change the argument in the ``set_model`` call\n", - "above to ``CPU``." + "If running on GPU, be sure to change the argument in the ``set_model`` call\n", + "above to ``device=\"GPU\"``." 
] }, { @@ -468,46 +466,46 @@ "name": "stdout", "output_type": "stream", "text": [ - "U: [[[-0.550159 0.8065786 ]\n", - " [-0.52288723 -0.5346357 ]\n", - " [-0.6510868 -0.2521817 ]]\n", + "U: [[[-0.31189808 0.86989427]\n", + " [-0.48122275 -0.49140105]\n", + " [-0.81923395 -0.0425336 ]]\n", "\n", - " [[-0.17983183 -0.20003092]\n", - " [-0.5534476 -0.7888692 ]\n", - " [-0.81323797 0.58109635]]\n", + " [[-0.5889101 -0.29554686]\n", + " [-0.43949458 -0.66398275]\n", + " [-0.6782547 0.68686163]]\n", "\n", - " [[-0.20800859 0.42269117]\n", - " [-0.65485084 -0.7300564 ]\n", - " [-0.7265692 0.53698224]]\n", + " [[-0.61623317 0.05853765]\n", + " [-0.6667615 -0.5695148 ]\n", + " [-0.4191489 0.81989413]]\n", "\n", - " [[-0.336111 0.77894354]\n", - " [-0.31149226 0.43854192]\n", - " [-0.8888205 -0.44825 ]]\n", + " [[-0.5424681 0.8400398 ]\n", + " [-0.31990844 -0.2152339 ]\n", + " [-0.77678 -0.49800384]]\n", "\n", - " [[-0.6365824 0.7635661 ]\n", - " [-0.2663487 -0.08588188]\n", - " [-0.723755 -0.639993 ]]]\n", + " [[-0.43667376 0.8088193 ]\n", + " [-0.70812154 -0.57906115]\n", + " [-0.5548693 0.10246649]]]\n", "\n", - ", S: [[137.34267 54.616768]\n", - " [142.89323 35.937744]\n", - " [ 90.98083 48.821 ]\n", - " [ 86.74378 31.835794]\n", - " [146.14839 36.327038]]\n", + ", S: [[137.10924 25.710997]\n", + " [131.49983 37.79937 ]\n", + " [178.72423 24.792084]\n", + " [125.13014 49.733784]\n", + " [137.48834 53.57199 ]]\n", "\n", - ", V: [[[-0.48165366 0.8763617 ]\n", - " [-0.8763617 -0.48165366]]\n", + ", V: [[[-0.8333395 0.5527615 ]\n", + " [-0.5527615 -0.8333395 ]]\n", "\n", - " [[-0.47905296 0.8777859 ]\n", - " [-0.8777859 -0.47905296]]\n", + " [[-0.5085228 -0.8610485 ]\n", + " [-0.8610485 0.5085228 ]]\n", "\n", - " [[-0.737007 -0.67588514]\n", - " [-0.67588514 0.737007 ]]\n", + " [[-0.8650402 0.5017025 ]\n", + " [-0.5017025 -0.8650402 ]]\n", "\n", - " [[-0.28137407 0.9595981 ]\n", - " [-0.9595981 -0.28137407]]\n", + " [[-0.56953645 0.8219661 ]\n", + " [-0.8219661 
-0.56953645]]\n", "\n", - " [[-0.5767642 -0.8169106 ]\n", - " [-0.8169106 0.5767642 ]]]\n", + " [[-0.6115895 0.79117525]\n", + " [-0.79117525 -0.6115895 ]]]\n", "\n" ] } @@ -594,8 +592,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[[0.03525018 0.04472604 0.02831913 0.1114466 0.25944078 0.11165252\n", - " 0.2983908 0.04830809 0.02390536 0.03856055]]\n" + "[[0.05032112 0.06484107 0.03512685 0.14747524 0.14440396 0.02395445\n", + " 0.03395916 0.06222691 0.26738793 0.1703033 ]]\n" ] } ], @@ -657,8 +655,6 @@ "\n", "And PyTorch has its own converter.\n", "\n", - "Currently the ONNX backend only works on Linux, but MacOS support will be added in the future.\n", - "\n", "Below are some examples of a few models in [Scikit-learn](https://scikit-learn.org)\n", "that are converted into ONNX format for use with SmartSim. To use ONNX in SmartSim, specify\n", "`ONNX` as the argument for *backend* in the call to `client.set_model` or\n", @@ -801,15 +797,7 @@ "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23:56:50 C02G13RYMD6N SmartSim[33744] INFO Stopping model orchestrator_0 with job name orchestrator_0-CVIG02IVGHO0\n" - ] - } - ], + "outputs": [], "source": [ "exp.stop(db)" ] @@ -830,12 +818,12 @@ " Name Entity-Type JobID RunID Time Status Returncode \n", "\n", "\n", - "0 orchestrator_0DBNode 35628 0 29.7008Cancelled-9 \n", + "0 orchestrator_0DBNode 31857 0 32.7161Cancelled0 \n", "\n", "" ], "text/plain": [ - "'\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID Time Status Returncode
0 orchestrator_0DBNode 35628 0 29.7008Cancelled-9
'" + "'\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID Time Status Returncode
0 orchestrator_0DBNode 31857 0 32.7161Cancelled0
'" ] }, "execution_count": 19, @@ -844,7 +832,7 @@ } ], "source": [ - "exp.summary(format=\"html\")" + "exp.summary(style=\"html\")" ] }, { @@ -901,24 +889,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "23:56:50 C02G13RYMD6N SmartSim[33744] INFO \n", + "21:18:06 C02G13RYMD6N SmartSim[30945] INFO \n", "\n", "=== Launch Summary ===\n", "Experiment: Inference-Tutorial\n", - "Experiment Path: /Users/mrdro/repos/ssimdev/ss/tutorials/ml_inference/Inference-Tutorial\n", + "Experiment Path: /Users/smartsim/smartsim/tutorials/ml_inference/Inference-Tutorial\n", "Launcher: local\n", "Models: 1\n", "Database Status: inactive\n", "\n", "=== Models ===\n", "colocated_model\n", - "Executable: /Users/mrdro/miniconda3/envs/smartsim/bin/python\n", + "Executable: /Users/smartsim/venv/bin/python\n", "Executable Arguments: ./colo-db-torch-example.py\n", "Co-located Database: True\n", "\n", "\n", "\n", - "23:56:52 C02G13RYMD6N SmartSim[33744] INFO colocated_model(35666): Completed\n" + "21:18:09 C02G13RYMD6N SmartSim[30945] INFO colocated_model(31865): Completed\n" ] } ], @@ -942,13 +930,13 @@ " Name Entity-Type JobID RunID Time Status Returncode \n", "\n", "\n", - "0 orchestrator_0 DBNode 35628 0 29.7008Cancelled-9 \n", - "1 colocated_modelModel 35666 0 2.1590 Completed0 \n", + "0 orchestrator_0 DBNode 31857 0 32.7161Cancelled0 \n", + "1 colocated_modelModel 31865 0 3.5862 Completed0 \n", "\n", "" ], "text/plain": [ - "'\\n\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID Time Status Returncode
0 orchestrator_0 DBNode 35628 0 29.7008Cancelled-9
1 colocated_modelModel 35666 0 2.1590 Completed0
'" + "'\\n\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID Time Status Returncode
0 orchestrator_0 DBNode 31857 0 32.7161Cancelled0
1 colocated_modelModel 31865 0 3.5862 Completed0
'" ] }, "execution_count": 22, @@ -957,7 +945,7 @@ } ], "source": [ - "exp.summary(format=\"html\")" + "exp.summary(style=\"html\")" ] } ], @@ -977,7 +965,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/tutorials/online_analysis/lattice/online_analysis.ipynb b/tutorials/online_analysis/lattice/online_analysis.ipynb index 48ddf6032..3389b1190 100644 --- a/tutorials/online_analysis/lattice/online_analysis.ipynb +++ b/tutorials/online_analysis/lattice/online_analysis.ipynb @@ -345,7 +345,7 @@ "source": [ "## Post-processing with TorchScript\n", "\n", - "We can upload [TorchScript functions](https://pytorch.org/docs/1.11/jit.html) to the DB. Tensors which are stored on the DB can be passed as arguments to uploaded functions and the results will be stored on the DB. This makes it possible to perform pre- and post-processing operations on tensors localli, *in the DB*, reducing the number of data transfers.\n", + "We can upload [TorchScript functions](https://pytorch.org/docs/2.0/jit.html) to the DB. Tensors which are stored on the DB can be passed as arguments to uploaded functions and the results will be stored on the DB. This makes it possible to perform pre- and post-processing operations on tensors localli, *in the DB*, reducing the number of data transfers.\n", "\n", "### Uploading a script\n", "We can load a file containing TorchScript-compatible functionsto the DB. For example, the file `./probe.script` contains the function `probe_points` which interpolates the values of `ux` and `uy` at some user-provided probe points. This is useful when we are interested in the value of a given fields only at specific locations.\n",