diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index 288561c398..714eb38191 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -1,7 +1,7 @@ # # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -39,11 +39,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: ref: doc path: doc-branch diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0b9b973c35..ad9a55e034 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,7 @@ # # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -56,8 +56,8 @@ jobs: os: [ubuntu-20.04, macos-12] steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 - name: Install cibuildwheel run: python -m pip install cibuildwheel>=2.12.3 @@ -93,9 +93,9 @@ jobs: name: Build source distribution runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 name: Install Python with: python-version: '3.8' diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index fc817cba9d..79466b9025 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -1,7 +1,7 @@ # # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,6 +34,10 @@ on: branches: - develop +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: HOMEBREW_NO_ANALYTICS: "ON" # Make Homebrew installation a little quicker HOMEBREW_NO_AUTO_UPDATE: "ON" @@ -53,15 +57,14 @@ jobs: os: [macos-12, ubuntu-20.04] # Operating systems compiler: [8] # GNU compiler version rai: [1.2.7] # Redis AI versions - py_v: [3.8, 3.9, '3.10'] # Python versions - + py_v: ['3.8', '3.9', '3.10', '3.11'] # Python versions env: SMARTSIM_REDISAI: ${{ matrix.rai }} steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.py_v }} @@ -101,19 +104,12 @@ jobs: # on developments of the client are brought in. 
- name: Install SmartSim (with ML backends) run: | - python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis python -m pip install .[dev,ml] - - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) - if: (matrix.py_v != '3.10') run: smart build --device cpu --onnx -v - - name: Install ML Runtimes with Smart (with pt and tf support) - if: (matrix.py_v == '3.10') - run: smart build --device cpu -v - - name: Run mypy run: | python -m pip install .[mypy] @@ -122,6 +118,15 @@ jobs: - name: Run Pylint run: make check-lint + # Run isort/black style check + - name: Run isort + run: isort --check-only --profile black ./smartsim ./tests + + # Run isort/black style check + - name: Run black + run: | + black --exclude smartsim/version.py --check ./smartsim ./tests + # Run pytest (backends subdirectory) - name: Run Pytest if: (matrix.subset == 'backends') @@ -151,7 +156,7 @@ jobs: retention-days: 5 - name: Upload Pytest coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3.1.4 with: fail_ci_if_error: false files: ./coverage.xml diff --git a/.pylintrc b/.pylintrc index da0886ba20..f2fa17bab4 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/.wci.yml b/.wci.yml index 55b5ddda10..fd4ae0c1c8 100644 --- a/.wci.yml +++ b/.wci.yml @@ -10,7 +10,7 @@ Machine Learning (ML) libraries, like PyTorch and TensorFlow, in combination with High Performance Computing (HPC) simulations and applications. SmartSim launches ML infrastructure on HPC systems alongside user workloads - and supports most HPC workload managers (e.g. Slurm, PBSPro, LSF, Cobalt). + and supports most HPC workload managers (e.g. Slurm, PBSPro, LSF). 
SmartSim also provides a set of client libraries in Python, C++, C, and Fortran. These client libraries allow users to send and receive data between user applications and the machine learning infrastructure. Moreover, the @@ -22,8 +22,8 @@ language: Python release: - version: 0.6.0 - date: 2023-12-18 + version: 0.6.1 + date: 2024-02-15 documentation: general: https://www.craylabs.org/docs/overview.html @@ -41,7 +41,6 @@ - Slurm - PBSPro - LSF - - Cobalt - Linux/MacOS transfer_protocols: - TCP/IP diff --git a/LICENSE.md b/LICENSE.md index 9312d5762a..7e5e1594be 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2021-2023, Hewlett Packard Enterprise +Copyright (c) 2021-2024, Hewlett Packard Enterprise All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/Makefile b/Makefile index fef69eab3a..d8a2f0e6b3 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -66,6 +66,10 @@ clobber: clean # help: # help: Style # help: ------- +# help: check-all - Performs all the style-related checks +.PHONY: check-all +check-all: check-style check-format check-sort-imports check-lint check-mypy + $(info All style checks PASSED) # help: style - Sort imports and format with black .PHONY: style @@ -146,11 +150,11 @@ tutorials-dev: @docker compose build tutorials-dev @docker run -p 8888:8888 smartsim-tutorials:dev-latest -# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.6.0) +# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.6.1) .PHONY: tutorials-prod tutorials-prod: @docker compose build tutorials-prod - @docker run -p 8888:8888 smartsim-tutorials:v0.6.0 + @docker run -p 8888:8888 smartsim-tutorials:v0.6.1 # help: diff --git a/README.md b/README.md index df671ef022..cfd8d42714 100644 --- a/README.md +++ b/README.md @@ -100,8 +100,8 @@ before using it on your system. Each tutorial is a Jupyter notebook that can be which will run a jupyter lab with the tutorials, SmartSim, and SmartRedis installed. ```bash -docker pull ghcr.io/craylabs/smartsim-tutorials:v0.4.1 -docker run -p 8888:8888 ghcr.io/craylabs/smartsim-tutorials:v0.4.1 +docker pull ghcr.io/craylabs/smartsim-tutorials:latest +docker run -p 8888:8888 ghcr.io/craylabs/smartsim-tutorials:latest # click on link to open jupyter lab ``` @@ -179,7 +179,6 @@ launch capabilities for all applications. 
- Slurm - LSF - PBSPro - - Cobalt - Local (for laptops/single node, no batch) @@ -198,7 +197,7 @@ qsub -l select=3:ncpus=20 -l walltime=00:10:00 -l place=scatter -I -q bsub -Is -W 00:10 -nnodes 3 -P $SHELL ``` -This same script will run on a SLURM, PBS, LSF, or Cobalt system as the ``launcher`` +This same script will run on a SLURM, PBS, or LSF system as the ``launcher`` is set to `auto` in the [Experiment](https://www.craylabs.org/docs/api/smartsim_api.html#experiment) initialization. The run command like ``mpirun``, ``aprun`` or ``srun`` will be automatically detected from what is available on the @@ -277,8 +276,8 @@ print(exp.get_status(ensemble)) python hello_ensemble.py ``` -Similar to the interactive example, this same script will run on a SLURM, PBS, LSF, -or Cobalt system as the ``launcher`` is set to `auto` in the +Similar to the interactive example, this same script will run on a SLURM, PBS, +or LSF system as the ``launcher`` is set to `auto` in the [Experiment](https://www.craylabs.org/docs/api/smartsim_api.html#experiment) initialization. Local launching does not support batch workloads. @@ -452,8 +451,8 @@ Each tutorial is a Jupyter notebook that can be run through the which will run a jupyter lab with the tutorials, SmartSim, and SmartRedis installed. ```bash -docker pull ghcr.io/craylabs/smartsim-tutorials:v1 -docker run -p 8888:8888 ghcr.io/craylabs/smartsim-tutorials:v0.4.1 +docker pull ghcr.io/craylabs/smartsim-tutorials:latest +docker run -p 8888:8888 ghcr.io/craylabs/smartsim-tutorials:latest ``` Each of the following examples can be found in the [SmartSim documentation](https://www.craylabs.org/docs/tutorials/getting_started/getting_started.html). 
@@ -640,15 +639,15 @@ from C, C++, Fortran and Python with the SmartRedis Clients: 1.2.7 PyTorch - 1.11.x + 2.0.1 TensorFlow\Keras - 2.8.x + 2.13.1 ONNX - 1.11.x + 1.14.1 diff --git a/conftest.py b/conftest.py index ff4e56ee12..b5a4fd70be 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -101,7 +101,7 @@ def print_test_configuration() -> None: def pytest_configure() -> None: pytest.test_launcher = test_launcher - pytest.wlm_options = ["slurm", "pbs", "cobalt", "lsf", "pals"] + pytest.wlm_options = ["slurm", "pbs", "lsf", "pals"] account = get_account() pytest.test_account = account pytest.test_device = test_device @@ -153,12 +153,7 @@ def kill_all_test_spawned_processes() -> None: def get_hostlist() -> t.Optional[t.List[str]]: global test_hostlist if not test_hostlist: - if "COBALT_NODEFILE" in os.environ: - try: - return _parse_hostlist_file(os.environ["COBALT_NODEFILE"]) - except FileNotFoundError: - return None - elif "PBS_NODEFILE" in os.environ and test_launcher == "pals": + if "PBS_NODEFILE" in os.environ and test_launcher == "pals": # with PALS, we need a hostfile even if `aprun` is available try: return _parse_hostlist_file(os.environ["PBS_NODEFILE"]) @@ -269,19 +264,6 @@ def get_base_run_settings( run_args = {"--np": ntasks, "--hostfile": host_file} run_args.update(kwargs) return RunSettings(exe, args, run_command="mpiexec", run_args=run_args) - if test_launcher == "cobalt": - if shutil.which("aprun"): - run_command = "aprun" - run_args = {"--pes": ntasks} - else: - run_command = "mpirun" - host_file = os.environ["COBALT_NODEFILE"] - run_args = {"-n": ntasks, "--hostfile": host_file} - run_args.update(kwargs) - settings = RunSettings( - exe, args, run_command=run_command, run_args=run_args - ) - return settings if 
test_launcher == "lsf": run_args = {"--np": ntasks, "--nrs": nodes} run_args.update(kwargs) @@ -289,7 +271,7 @@ def get_base_run_settings( return settings if test_launcher != "local": raise SSConfigError( - "Base run settings are available for Slurm, PBS, Cobalt, " + "Base run settings are available for Slurm, PBS, " f"and LSF, but launcher was {test_launcher}" ) # TODO allow user to pick aprun vs MPIrun @@ -320,18 +302,6 @@ def get_run_settings( run_args = {"np": ntasks, "hostfile": host_file} run_args.update(kwargs) return PalsMpiexecSettings(exe, args, run_args=run_args) - # TODO allow user to pick aprun vs MPIrun - if test_launcher == "cobalt": - if shutil.which("aprun"): - run_args = {"pes": ntasks} - run_args.update(kwargs) - return AprunSettings(exe, args, run_args=run_args) - - host_file = os.environ["COBALT_NODEFILE"] - run_args = {"n": ntasks, "hostfile": host_file} - run_args.update(kwargs) - return MpirunSettings(exe, args, run_args=run_args) - if test_launcher == "lsf": run_args = { "nrs": nodes, @@ -344,7 +314,7 @@ def get_run_settings( @staticmethod def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: - if test_launcher in ["pbs", "cobalt"]: + if test_launcher == "pbs": if not shutil.which("aprun"): hostlist = get_hostlist() else: @@ -698,3 +668,7 @@ def setup_test_colo( assert colo_model.colocated # Check to make sure that limit_db_cpus made it into the colo settings return colo_model + +@pytest.fixture +def config() -> smartsim._core.config.Config: + return CONFIG diff --git a/doc/_static/custom_tab_style.css b/doc/_static/custom_tab_style.css new file mode 100644 index 0000000000..f31e13667c --- /dev/null +++ b/doc/_static/custom_tab_style.css @@ -0,0 +1,7 @@ +.sphinx-tabs-panel { + background-color: inherit; +} + +.sphinx-tabs-tab[aria-selected="true"] { + background-color: inherit; +} \ No newline at end of file diff --git a/doc/_static/version_names.json b/doc/_static/version_names.json index 8ae78ebdb3..7b49ea2ccc 100644 
--- a/doc/_static/version_names.json +++ b/doc/_static/version_names.json @@ -1,7 +1,8 @@ { "version_names":[ "develop (unstable)", - "0.6.0 (stable)", + "0.6.1 (stable)", + "0.6.0", "0.5.1", "0.5.0", "0.4.2", @@ -12,6 +13,7 @@ "version_urls": [ "https://www.craylabs.org/develop/overview.html", "https://www.craylabs.org/docs/overview.html", + "https://www.craylabs.org/docs/versions/0.6.0/overview.html", "https://www.craylabs.org/docs/versions/0.5.1/overview.html", "https://www.craylabs.org/docs/versions/0.5.0/overview.html", "https://www.craylabs.org/docs/versions/0.4.2/overview.html", diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index 5136c8aa5d..adf7081ecc 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -43,8 +43,8 @@ Settings are provided to ``Model`` and ``Ensemble`` objects to provide parameters for how a job should be executed. Some are specifically meant for certain launchers like ``SbatchSettings`` is solely meant for system using Slurm as a workload manager. -``MpirunSettings`` for OpenMPI based jobs is supported by Slurm, -PBSPro, and Cobalt. +``MpirunSettings`` for OpenMPI based jobs is supported by Slurm +and PBSPro. Types of Settings: @@ -60,7 +60,6 @@ Types of Settings: JsrunSettings SbatchSettings QsubBatchSettings - CobaltBatchSettings BsubBatchSettings Settings objects can accept a container object that defines a container @@ -137,7 +136,7 @@ AprunSettings ``AprunSettings`` can be used on any system that supports the Cray ALPS layer. SmartSim supports using ``AprunSettings`` -on PBSPro and Cobalt WLM systems. +on PBSPro WLM systems. ``AprunSettings`` can be used in interactive session (on allocation) and within batch launches (e.g., ``QsubBatchSettings``) @@ -204,7 +203,7 @@ MpirunSettings ``MpirunSettings`` are for launching with OpenMPI. ``MpirunSettings`` are -supported on Slurm, PBSpro, and Cobalt. +supported on Slurm and PBSpro. .. 
autosummary:: @@ -231,7 +230,7 @@ MpiexecSettings ``MpiexecSettings`` are for launching with OpenMPI's ``mpiexec``. ``MpirunSettings`` are -supported on Slurm, PBSpro, and Cobalt. +supported on Slurm and PBSpro. .. autosummary:: @@ -258,7 +257,7 @@ OrterunSettings ``OrterunSettings`` are for launching with OpenMPI's ``orterun``. ``OrterunSettings`` are -supported on Slurm, PBSpro, and Cobalt. +supported on Slurm and PBSpro. .. autosummary:: @@ -336,32 +335,6 @@ be launched as a batch on PBSPro systems. :members: -.. _cqsub_api: - - -CobaltBatchSettings -------------------- - -``CobaltBatchSettings`` are used to configure jobs that should -be launched as a batch on Cobalt Systems. They closely mimic -that of the ``QsubBatchSettings`` for PBSPro. - - -.. autosummary:: - - CobaltBatchSettings.set_account - CobaltBatchSettings.set_batch_command - CobaltBatchSettings.set_nodes - CobaltBatchSettings.set_queue - CobaltBatchSettings.set_walltime - CobaltBatchSettings.format_batch_args - -.. autoclass:: CobaltBatchSettings - :inherited-members: - :undoc-members: - :members: - - .. 
_bsub_api: BsubBatchSettings diff --git a/doc/changelog.rst b/doc/changelog.rst index befb9ee37d..e114556240 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -11,12 +11,118 @@ Jump to :ref:`SmartRedis Changelog ` SmartSim ======== +0.6.1 +----- + +Released on 15 February, 2024 + +Description + +- Duplicate for DBModel/Script prevented +- Update license to include 2024 +- Telemetry monitor is now active by default +- Add support for Mac OSX on Apple Silicon +- Remove Torch warnings during testing +- Validate Slurm timing format +- Expose Python Typehints +- Fix test_logs to prevent generation of directory +- Fix Python Typehint for colocated database settings +- Python 3.11 Support +- Quality of life `smart validate` improvements +- Remove Cobalt support +- Enrich logging through context variables +- Upgrade Machine Learning dependencies +- Override sphinx-tabs background color +- Add concurrency group to test workflow +- Fix index when installing torch through smart build -Development branch ------------------- -To be released at some future point in time +Detailed Notes +- Modify the `git clone` for both Redis and RedisAI to set the line endings to + unix-style line endings when using MacOS on ARM. (SmartSim-PR482_) +- Separate install instructions are now provided for Mac OSX on x64 vs ARM64 (SmartSim-PR479_) +- Prevent duplicate ML model and script names being added to an + Ensemble member if the names already exist. (SmartSim-PR475_) +- Updates `Copyright (c) 2021-2023` to `Copyright (c) 2021-2024` + in all of the necessary files. (SmartSim-PR485_) +- Fix a bug that prevented the expected behavior when the `SMARTSIM_LOG_LEVEL` + environment variable was set to `developer`. (SmartSim-PR473_) +- Sets the default value of the "enable telemetry" flag to on. + Bumps the output `manifest.json` version number to match that of + `smartdashboard` and pins a watchdog version to avoid build errors. 
+ (SmartSim-PR477_) +- Refactor logic of `Manifest.has_db_objects` to remove excess branching + and improve readability/maintainability. (SmartSim-PR476_) +- SmartSim can now be built and used on platforms using Apple Silicon + (ARM64). Currently, only the PyTorch backend is supported. Note that libtorch + will be downloaded from a CrayLabs github repo. (SmartSim-PR465_) +- Tests that were saving Torch models were emitting warnings. These warnings + were addressed by updating the model save test function. (SmartSim-PR472_) +- Validate the timing format when requesting a slurm allocation. (SmartSim-PR471_) +- Add and ship `py.typed` marker to expose inline type hints. Fix + type errors related to SmartRedis. (SmartSim-PR468_) +- Fix the `test_logs.py::test_context_leak` test that was + erroneously creating a directory named `some value` in SmartSim's root + directory. (SmartSim-PR467_) +- Add Python type hinting to colocated settings. (SmartSim-PR462_) +- Add github actions for running black and isort checks. (SmartSim-PR464_) +- Relax the required version of `typing_extensions`. (SmartSim-PR459_) +- Addition of Python 3.11 to SmartSim. (SmartSim-PR461_) +- Quality of life `smart validate` improvements such as setting `CUDA_VISIBLE_DEVICES` + environment variable within `smart validate` prior to importing any ML deps to + prevent false negatives on multi-GPU systems. Additionally, move SmartRedis logs + from standard out to dedicated log file in the validation temporary directory as well as + suppress `sklearn` deprecation warning by pinning `KMeans` constructor + argument. Lastly, move TF test to last as TF may reserve the GPUs it uses. + (SmartSim-PR458_) +- Some actions in the current GitHub CI/CD workflows were outdated. They were + replaced with the latest versions. (SmartSim-PR446_) +- As the Cobalt workload manager is not used on any system we are aware of, + its support in SmartSim was terminated and classes such as `CobaltLauncher` have + been removed. 
(SmartSim-PR448_) +- Experiment logs are written to a file that can be read by the dashboard. (SmartSim-PR452_) +- Updated SmartSim's machine learning backends to PyTorch 2.0.1, Tensorflow + 2.13.1, ONNX 1.14.1, and ONNX Runtime 1.16.1. As a result of this change, + there is now an available ONNX wheel for use with Python 3.10, and wheels for + all of SmartSim's machine learning backends with Python 3.11. + (SmartSim-PR451_) (SmartSim-PR461_) +- The sphinx-tabs documentation extension uses a white background for the tabs component. + A custom CSS for those components to inherit the overall theme color has + been added. (SmartSim-PR453_) +- Add concurrency groups to GitHub's CI/CD workflows, preventing + multiple workflows from the same PR from being launched concurrently. + (SmartSim-PR439_) +- Torch changed their preferred indexing when trying to install + their provided wheels. Updated the `pip install` command within + `smart build` to ensure that the appropriate packages can be found. + (SmartSim-PR449_) + + +.. _SmartSim-PR485: https://github.com/CrayLabs/SmartSim/pull/485 +.. _SmartSim-PR482: https://github.com/CrayLabs/SmartSim/pull/482 +.. _SmartSim-PR479: https://github.com/CrayLabs/SmartSim/pull/479 +.. _SmartSim-PR477: https://github.com/CrayLabs/SmartSim/pull/477 +.. _SmartSim-PR476: https://github.com/CrayLabs/SmartSim/pull/476 +.. _SmartSim-PR475: https://github.com/CrayLabs/SmartSim/pull/475 +.. _SmartSim-PR473: https://github.com/CrayLabs/SmartSim/pull/473 +.. _SmartSim-PR472: https://github.com/CrayLabs/SmartSim/pull/472 +.. _SmartSim-PR471: https://github.com/CrayLabs/SmartSim/pull/471 +.. _SmartSim-PR468: https://github.com/CrayLabs/SmartSim/pull/468 +.. _SmartSim-PR467: https://github.com/CrayLabs/SmartSim/pull/467 +.. _SmartSim-PR465: https://github.com/CrayLabs/SmartSim/pull/465 +.. _SmartSim-PR464: https://github.com/CrayLabs/SmartSim/pull/464 +.. _SmartSim-PR462: https://github.com/CrayLabs/SmartSim/pull/462 +.. 
_SmartSim-PR461: https://github.com/CrayLabs/SmartSim/pull/461 +.. _SmartSim-PR459: https://github.com/CrayLabs/SmartSim/pull/459 +.. _SmartSim-PR458: https://github.com/CrayLabs/SmartSim/pull/458 +.. _SmartSim-PR453: https://github.com/CrayLabs/SmartSim/pull/453 +.. _SmartSim-PR452: https://github.com/CrayLabs/SmartSim/pull/452 +.. _SmartSim-PR451: https://github.com/CrayLabs/SmartSim/pull/451 +.. _SmartSim-PR449: https://github.com/CrayLabs/SmartSim/pull/449 +.. _SmartSim-PR448: https://github.com/CrayLabs/SmartSim/pull/448 +.. _SmartSim-PR446: https://github.com/CrayLabs/SmartSim/pull/446 +.. _SmartSim-PR439: https://github.com/CrayLabs/SmartSim/pull/439 0.6.0 ----- @@ -434,7 +540,7 @@ Expand Machine Learning Library Support: Expand Launcher Setting Options: - - Add ability to use base ``RunSettings`` on a Slurm, PBS, or Cobalt launchers (SmartSim-PR90_) + - Add ability to use base ``RunSettings`` on Slurm or PBS launchers (SmartSim-PR90_) - Add ability to use base ``RunSettings`` on LFS launcher (SmartSim-PR108_) Deprecations and Breaking Changes diff --git a/doc/conf.py b/doc/conf.py index 908b9534fb..d5b6f21da9 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -19,14 +19,14 @@ # -- Project information ----------------------------------------------------- project = 'SmartSim' -copyright = '2021-2023, Hewlett Packard Enterprise' +copyright = '2021-2024, Hewlett Packard Enterprise' author = 'Cray Labs' try: import smartsim version = smartsim.__version__ except ImportError: - version = "0.6.0" + version = "0.6.1" # The full version, including alpha/beta/rc tags release = version @@ -100,6 +100,11 @@ "extra_footer": extra_footer, } +# Use a custom style sheet to prevent the sphinx-tabs extension from using +# a white background with dark themes. If sphinx-tabs updates its +# static/tabs.css, this may need to be updated. 
+html_css_files = ['custom_tab_style.css'] + autoclass_content = 'both' add_module_names = False diff --git a/doc/developer.rst b/doc/developer.rst index 4009819c3c..632ee8d45b 100644 --- a/doc/developer.rst +++ b/doc/developer.rst @@ -84,14 +84,14 @@ Local ===== There are two levels of testing in SmartSim. The first runs by default and does -not launch any jobs out onto a system through a workload manager like Cobalt. +not launch any jobs out onto a system through a workload manager like Slurm. If any of the above commands are used, the test suite will run the "light" test suite by default. -PBSPro, Slurm, Cobalt, LSF -========================== +PBSPro, Slurm, LSF +================== To run the full test suite, users will have to be on a system with one of the above workload managers. Additionally, users will need to obtain an allocation @@ -105,9 +105,6 @@ of at least 3 nodes. # for PBSPro (with aprun) qsub -l select=3 -l place=scatter -l walltime=00:10:00 -q queue - # for Cobalt (with aprun) - qsub -n 3 -t 00:10:00 -A account -q queue -I - # for LSF (with jsrun) bsub -Is -W 00:30 -nnodes 3 -P project $SHELL @@ -117,7 +114,6 @@ Once in an iterative allocation, users will need to set the test launcher environment variable: ``SMARTSIM_TEST_LAUNCHER`` to one of the following values - slurm - - cobalt - pbs - lsf - local diff --git a/doc/experiment.rst b/doc/experiment.rst index f7950d6d60..986db4cadc 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -38,8 +38,8 @@ available compute resources on the system. Each launcher supports specific types of ``RunSettings``. 
- :ref:`SrunSettings ` for Slurm - - :ref:`AprunSettings ` for PBSPro and Cobalt - - :ref:`MpirunSettings ` for OpenMPI with `mpirun` on PBSPro, Cobalt, LSF, and Slurm + - :ref:`AprunSettings ` for PBSPro + - :ref:`MpirunSettings ` for OpenMPI with `mpirun` on PBSPro, LSF, and Slurm - :ref:`JsrunSettings ` for LSF These settings can be manually specified by the user, or auto-detected by the @@ -181,7 +181,6 @@ workload manager and available compute resources. - :ref:`SbatchSettings ` for Slurm - :ref:`QsubBatchSettings ` for PBSPro - - :ref:`CobaltBatchSettings ` for Cobalt - :ref:`BsubBatchSettings ` for LSF If it only passed ``RunSettings``, ``Ensemble``, objects will require either diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index 3874eb9610..2f43db50f6 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -18,7 +18,7 @@ Basic The base prerequisites to install SmartSim and SmartRedis are: - - Python 3.8-3.10 + - Python 3.8-3.11 - Pip - Cmake 3.13.x (or later) - C compiler @@ -33,6 +33,13 @@ The base prerequisites to install SmartSim and SmartRedis are: GCC 5-9, 11, and 12 is recommended. There are known bugs with GCC 10. +.. warning:: + + Apple Clang 15 seems to have issues on MacOS with Apple Silicon. Please modify + your path to ensure that a version of GCC installed by brew has priority. Note + this seems to be hardcoded to `gcc` and `g++` in the Redis build so ensure that + `which gcc g++` do not point to Apple Clang. 
+ GPU Support =========== @@ -41,7 +48,7 @@ The machine-learning backends have additional requirements in order to use GPUs for inference - `CUDA Toolkit 11 (tested with 11.8) `_ - - `cuDNN 8 (tested with 8.2.1 and 8.4.0) `_ + - `cuDNN 8 (tested with 8.9.1) `_ - OS: Linux - GPU: Nvidia @@ -63,19 +70,20 @@ Supported Versions - GPU - Python Versions * - MacOS - - x86_64 + - x86_64, aarch64 - Not supported - - 3.8 - 3.10 + - 3.8 - 3.11 * - Linux - x86_64 - Nvidia - - 3.8 - 3.10 + - 3.8 - 3.11 .. note:: - Windows is not supported and there are currently no plans - to support Windows. + Users have successfully run SmartSim on Windows using Windows Subsystem for Linux + with Nvidia support. Generally, users should follow the Linux instructions here, + however we make no guarantee or offer of support. Native support for various machine learning libraries and their @@ -84,15 +92,19 @@ versions is dictated by our dependency on RedisAI_ 1.2.7. +------------------+----------+-------------+---------------+ | RedisAI | PyTorch | Tensorflow | ONNX Runtime | +==================+==========+=============+===============+ -| 1.2.7 (default) | 1.11.0 | 2.8.0 | 1.11.1 | +| 1.2.7 (default) | 2.0.1 | 2.13.1 | 1.16.3 | +------------------+----------+-------------+---------------+ +.. warning:: + + On Apple Silicon, only the PyTorch backend is supported for now. Please contact us + if you need support for other backends. + TensorFlow_ 2.0 and Keras_ are supported through `graph freezing`_. ScikitLearn_ and Spark_ models are supported by SmartSim as well through the use of the ONNX_ runtime (which is not built by -default due to issues with glibc on a variety of Linux -platforms and lack of support for Mac OS X). +default due to issues with glibc on a variety of Linux platforms). .. _Spark: https://spark.apache.org/mllib/ .. _Keras: https://keras.io @@ -106,7 +118,7 @@ platforms and lack of support for Mac OS X). 
------------------------------------------------------------ MacOS-only -========== +============ We recommend users and contributors install brew_ for managing installed packages. For contributors, the following brew packages can be helpful: @@ -242,9 +254,9 @@ SmartSim does. * - Platform - Python Versions * - MacOS - - 3.7 - 3.10 + - 3.8 - 3.11 * - Linux - - 3.7 - 3.10 + - 3.8 - 3.11 The Python client for SmartRedis is installed through ``pip`` as follows: @@ -275,6 +287,7 @@ First, clone SmartSim. git clone https://github.com/CrayLabs/SmartSim smartsim + And then install SmartSim with pip in *editable* mode. This way, SmartSim is installed in your virtual environment and available on `sys.path`, but the source remains at the site of the clone instead of in site-packages. @@ -287,12 +300,29 @@ source remains at the site of the clone instead of in site-packages. Use the now installed ``smart`` cli to install the machine learning runtimes. -.. code-block:: bash +.. tabs:: + + .. tab:: Linux + + .. code-block:: bash + + # run one of the following + smart build --device cpu --onnx # install with cpu-only support + smart build --device gpu --onnx # install with both cpu and gpu support + + + .. tab:: MacOS (Intel x64) + + .. code-block:: bash + + smart build --device cpu --onnx # install all backends (PT, TF, ONNX) on cpu + + + .. tab:: MacOS (Apple Silicon) + + .. 
code-block:: bash - # run one of the following - smart build -v --device cpu # verbose install cpu - smart build -v --device gpu # verbose install gpu - smart build -v --device gpu --onnx # install all backends (PT, TF, ONNX) on gpu + smart build --device cpu --no_tf # Only install PyTorch (TF/ONNX unsupported) Build the SmartRedis library diff --git a/doc/installation_instructions/platform/olcf-summit.rst b/doc/installation_instructions/platform/olcf-summit.rst index dfd7b9666b..422c771b42 100644 --- a/doc/installation_instructions/platform/olcf-summit.rst +++ b/doc/installation_instructions/platform/olcf-summit.rst @@ -19,7 +19,7 @@ into problems. .. code-block:: bash # setup Python and build environment - export ENV_NAME=smartsim-0.6.0 + export ENV_NAME=smartsim-0.6.1 git clone https://github.com/CrayLabs/SmartRedis.git smartredis git clone https://github.com/CrayLabs/SmartSim.git smartsim conda config --prepend channels https://ftp.osuosl.org/pub/open-ce/1.6.1/ diff --git a/doc/launchers.rst b/doc/launchers.rst index 7d0c9970f5..22425071ed 100644 --- a/doc/launchers.rst +++ b/doc/launchers.rst @@ -16,9 +16,8 @@ SmartSim currently supports 5 `launchers`: 1. ``local``: for single-node, workstation, or laptop 2. ``slurm``: for systems using the Slurm scheduler 3. ``pbs``: for systems using the PBSpro scheduler - 4. ``cobalt``: for systems using the Cobalt scheduler - 5. ``lsf``: for systems using the LSF scheduler - 6. ``auto``: have SmartSim auto-detect the launcher to use. + 4. ``lsf``: for systems using the LSF scheduler + 5. ``auto``: have SmartSim auto-detect the launcher to use. To specify a specific launcher, one argument needs to be provided to the ``Experiment`` initialization. @@ -30,7 +29,6 @@ to the ``Experiment`` initialization. 
exp = Experiment("name-of-experiment", launcher="local") # local launcher exp = Experiment("name-of-experiment", launcher="slurm") # Slurm launcher exp = Experiment("name-of-experiment", launcher="pbs") # PBSpro launcher - exp = Experiment("name-of-experiment", launcher="cobalt") # Cobalt launcher exp = Experiment("name-of-experiment", launcher="lsf") # LSF launcher exp = Experiment("name-of-experiment", launcher="auto") # auto-detect launcher @@ -219,42 +217,10 @@ creation. --------------------------------------------------------------------- -Cobalt -====== - -The Cobalt Launcher works just like the PBSPro launcher and -is compatible with ALPS and OpenMPI workloads as well. - -To use the Cobalt launcher, specify at ``Experiment`` initialization: - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("MOM6-double-gyre", launcher="cobalt") - - -Running on Cobalt ------------------ - -The Cobalt launcher supports three types of ``RunSettings``: - 1. :ref:`AprunSettings ` - 2. :ref:`MpirunSettings ` - 3. :ref:`MpiexecSettings ` - -As well as batch settings for ``qsub`` through: - 1. :ref:`CobaltBatchSettings ` - -Both supported ``RunSettings`` types above can be added -to a ``CobaltBatchSettings`` batch workload through ``Ensemble`` -creation. - ---------------------------------------------------------------------- - LSF === -The LSF Launcher works like the PBSPro and Cobalt launchers and +The LSF Launcher works like the PBSPro launcher and is compatible with LSF and OpenMPI workloads. To use the LSF launcher, specify at ``Experiment`` initialization: diff --git a/doc/ml_features.rst b/doc/ml_features.rst index 51027e7aec..6096f005e7 100644 --- a/doc/ml_features.rst +++ b/doc/ml_features.rst @@ -169,7 +169,7 @@ to the DB using the SmartRedis client. .. group-tab:: PyTorch - PyTorch requires models to be `jit-traced `__. + PyTorch requires models to be `jit-traced `__. 
The method ``torch.jit.save()`` can either store the model in memory or on file. Here, we will keep it in memory as a bytestring. @@ -239,7 +239,7 @@ it can be uploaded to the DB using the SmartRedis client. .. group-tab:: PyTorch - PyTorch requires models to be `jit-traced `__. + PyTorch requires models to be `jit-traced `__. The method ``torch.jit.save()`` can either store the model in memory or on file. Here, we will save it to a file located at ``./traced_model.pt``. diff --git a/doc/overview.rst b/doc/overview.rst index 3ef046bb0c..241d54eca3 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -61,8 +61,7 @@ The key features of the IL are: - An API to start, monitor, and stop HPC jobs from Python or from a Jupyter notebook. - Automated deployment of in-memory data staging (`Redis `_) and computational storage (`RedisAI `_). - - Programmatic launches of batch and in-allocation jobs on PBS, Slurm, LSF, - and Cobalt systems. + - Programmatic launches of batch and in-allocation jobs on PBS, Slurm, and LSF systems. - Creating and configuring ensembles of workloads with isolated communication channels. 
The IL can configure and launch batch jobs as well as jobs within interactive diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 38d9c80527..e883a28058 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -6,9 +6,9 @@ sphinx-copybutton==0.5.2 sphinx-tabs==3.4.4 nbsphinx==0.9.3 docutils==0.18.1 -torch==1.11.0 -tensorflow==2.8.1 +torch==2.0.1 +tensorflow==2.13.1 ipython jinja2==3.1.2 protobuf -numpy \ No newline at end of file +numpy diff --git a/doc/testing.rst b/doc/testing.rst index bdaa473d70..ccb2db3c22 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -78,9 +78,6 @@ Examples of how to obtain allocations on systems with the launchers: # for PBSPro (with aprun) qsub -l select=4 -l place=scatter -l walltime=00:10:00 -q queue - # for Cobalt (with aprun) - qsub -n 4 -t 00:10:00 -A account -q queue -I - # for LSF (with jsrun) bsub -Is -W 00:30 -nnodes 4 -P project $SHELL @@ -91,7 +88,6 @@ launcher environment variable: ``SMARTSIM_TEST_LAUNCHER`` to one of the following values - slurm - - cobalt - pbs - lsf - local @@ -273,4 +269,3 @@ The actions are defined using yaml files are are located in the Each pull request, push and merge the test suite for SmartRedis and SmartSim are run. For SmartSim, this is the ``local`` test suite with the local launcher. - diff --git a/docker-compose.yml b/docker-compose.yml index c492e6324b..f69743f145 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: - "8888:8888" tutorials-prod: - image: smartsim-tutorials:v0.4.2 + image: smartsim-tutorials:v0.6.1 build: context: . dockerfile: ./docker/prod/Dockerfile diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile index 6a5f82642b..c643787c32 100644 --- a/docker/dev/Dockerfile +++ b/docker/dev/Dockerfile @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index eee8099106..eff99de361 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile index 628d9af60f..09e94dee02 100644 --- a/docker/prod/Dockerfile +++ b/docker/prod/Dockerfile @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,7 +46,7 @@ COPY --chown=craylabs:root ./tutorials/ /home/craylabs/tutorials/ USER craylabs RUN export PATH=/home/craylabs/.local/bin:$PATH && \ echo "export PATH=/home/craylabs/.local/bin:$PATH" >> /home/craylabs/.bashrc && \ - python -m pip install smartsim[ml]==0.6.0 jupyter jupyterlab matplotlib && \ + python -m pip install smartsim[ml]==0.6.1 jupyter jupyterlab matplotlib && \ smart build --device cpu -v && \ chown craylabs:root -R /home/craylabs/.local && \ rm -rf ~/.cache/pip diff --git a/docker/testing/Dockerfile b/docker/testing/Dockerfile index 1b59e00468..9c247c3201 100644 --- a/docker/testing/Dockerfile +++ b/docker/testing/Dockerfile @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/pyproject.toml b/pyproject.toml index 60c33bee5e..4415c63cac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -91,18 +91,32 @@ disallow_untyped_defs = true disallow_incomplete_defs = true disallow_untyped_decorators = true +# Probably Unintended Branches/Blocks +# warn_unreachable = true + # Safety/Upgrading Mypy warn_unused_ignores = true warn_redundant_casts = true warn_unused_configs = true show_error_codes = true +# Misc Strictness Settings +strict_concatenate = false +strict_equality = true + +# Additional Error Codes +enable_error_code = [ + # "redundant-expr", + # "possibly-undefined", + # "unused-awaitable", + # "ignore-without-code", + # "mutable-override", +] + [[tool.mypy.overrides]] # Ignore packages that are not used or not typed module = [ "coloredlogs", - "smartredis", - "smartredis.error", "redis.cluster", "keras", "torch", diff --git a/setup.cfg b/setup.cfg index 49419c7eb9..5fdfa82aeb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -45,6 +45,7 @@ classifiers = Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 License :: OSI Approved :: BSD License Intended Audience :: Science/Research Topic :: Scientific/Engineering @@ -55,7 +56,7 @@ setup_requires = setuptools>=39.2 cmake>=3.13 include_package_data = True -python_requires = >=3.8,<3.11 +python_requires = >=3.8,<3.12 [options.packages.find] include = @@ -67,5 +68,7 @@ exclude = smartredis [options.package_data] +smartsim = + py.typed smartsim._core.bin = * diff --git a/setup.py b/setup.py index 66a534456b..bc7cf60d66 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -167,7 +167,7 @@ def has_ext_modules(_placeholder): "tqdm>=4.50.2", "filelock>=3.4.2", "protobuf~=3.20", - "watchdog>=3.0.0", + "watchdog>=3.0.0,<4.0.0", ] # Add SmartRedis at specific version @@ -190,6 +190,7 @@ def has_ext_modules(_placeholder): "types-tqdm", "types-tensorflow==2.12.0.9", "types-setuptools", + "typing_extensions>=4.1.0", ], # see smartsim/_core/_install/buildenv.py for more details **versions.ml_extras_required() diff --git a/smartsim/__init__.py b/smartsim/__init__.py index d3f5062b8a..7c1fa2fe0a 100644 --- a/smartsim/__init__.py +++ b/smartsim/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/__init__.py b/smartsim/_core/__init__.py index b835c7a0e2..bbc108f480 100644 --- a/smartsim/_core/__init__.py +++ b/smartsim/_core/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/__init__.py b/smartsim/_core/_cli/__init__.py index bf6fd954c7..efe03908e0 100644 --- a/smartsim/_core/_cli/__init__.py +++ b/smartsim/_core/_cli/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/__main__.py b/smartsim/_core/_cli/__main__.py index 47df070483..66a50095ae 100644 --- a/smartsim/_core/_cli/__main__.py +++ b/smartsim/_core/_cli/__main__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 474d96c8a4..443b916b76 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -26,6 +26,7 @@ import argparse import os +import platform import sys import typing as t from pathlib import Path @@ -60,14 +61,6 @@ def check_py_onnx_version(versions: Versioner) -> None: """Check Python environment for ONNX installation""" - if not versions.ONNX: - py_version = sys.version_info - msg = ( - "An onnx wheel is not available for " - f"Python {py_version.major}.{py_version.minor}. " - "Instead consider using Python 3.8 or 3.9 for ONNX 1.11 support" - ) - raise SetupError(msg) _check_packages_in_python_env( { "onnx": Version_(versions.ONNX), @@ -121,7 +114,12 @@ def build_database( # check database installation database_name = "KeyDB" if keydb else "Redis" database_builder = builder.DatabaseBuilder( - build_env(), build_env.MALLOC, build_env.JOBS, verbose + build_env(), + jobs=build_env.JOBS, + _os=builder.OperatingSystem.from_str(platform.system()), + architecture=builder.Architecture.from_str(platform.machine()), + malloc=build_env.MALLOC, + verbose=verbose, ) if not database_builder.is_built: logger.info( @@ -153,7 +151,7 @@ def build_redis_ai( backends_table = [ ["PyTorch", versions.TORCH, color_bool(use_torch)], ["TensorFlow", versions.TENSORFLOW, color_bool(use_tf)], - ["ONNX", versions.ONNX or "Unavailable", color_bool(use_onnx)], + ["ONNX", versions.ONNX, color_bool(use_onnx)], ] print(tabulate(backends_table, tablefmt="fancy_outline"), end="\n\n") print(f"Building for GPU support: {color_bool(device == 'gpu')}\n") @@ -181,12 +179,14 @@ def build_redis_ai( rai_builder = builder.RedisAIBuilder( build_env=build_env_dict, + jobs=build_env.JOBS, + _os=builder.OperatingSystem.from_str(platform.system()), + architecture=builder.Architecture.from_str(platform.machine()), torch_dir=str(torch_dir) if torch_dir else "", libtf_dir=str(libtf_dir) if libtf_dir else "", build_torch=use_torch, build_tf=use_tf, build_onnx=use_onnx, - jobs=build_env.JOBS, verbose=verbose, ) @@ -226,9 
+226,10 @@ def build_redis_ai( logger.info("ML Backends and RedisAI build complete!") -def check_py_torch_version(versions: Versioner, device: _TDeviceStr = "cpu") -> None: +def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu") -> None: """Check Python environment for TensorFlow installation""" + device = device_in.lower() if BuildEnv.is_macos(): if device == "gpu": raise BuildError("SmartSim does not support GPU on MacOS") @@ -260,10 +261,11 @@ def check_py_torch_version(versions: Versioner, device: _TDeviceStr = "cpu") -> "Torch version not found in python environment. " "Attempting to install via `pip`" ) + wheel_device = device if device == "cpu" else device_suffix.replace("+", "") pip( "install", - "-f", - "https://download.pytorch.org/whl/torch_stable.html", + "--extra-index-url", + f"https://download.pytorch.org/whl/{wheel_device}", *(f"{package}=={version}" for package, version in torch_deps.items()), ) elif missing or conflicts: diff --git a/smartsim/_core/_cli/clean.py b/smartsim/_core/_cli/clean.py index d8a85f8a98..50e267d805 100644 --- a/smartsim/_core/_cli/clean.py +++ b/smartsim/_core/_cli/clean.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/cli.py b/smartsim/_core/_cli/cli.py index ef4c113e19..3cad573d18 100644 --- a/smartsim/_core/_cli/cli.py +++ b/smartsim/_core/_cli/cli.py @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/dbcli.py b/smartsim/_core/_cli/dbcli.py index ce0975bc4d..733c2fe4d4 100644 --- a/smartsim/_core/_cli/dbcli.py +++ b/smartsim/_core/_cli/dbcli.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/site.py b/smartsim/_core/_cli/site.py index c86e0341bb..386f642c0e 100644 --- a/smartsim/_core/_cli/site.py +++ b/smartsim/_core/_cli/site.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index e31d0aed2a..8bf0984df6 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index bda2548590..8ea40ae007 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -25,13 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import contextlib import io import multiprocessing as mp import os +import os.path import socket import tempfile import typing as t -from contextlib import contextmanager from types import TracebackType import numpy as np @@ -52,8 +53,6 @@ if t.TYPE_CHECKING: - # Pylint disables needed for old version of pylint w/ TF 2.6.2 - # pylint: disable-next=unused-import from multiprocessing.connection import Connection # pylint: disable-next=unsubscriptable-object @@ -89,12 +88,23 @@ def execute( simple experiment """ backends = installed_redisai_backends() + device: _TCapitalDeviceStr = args.device.upper() try: - with _VerificationTempDir(dir=os.getcwd()) as temp_dir: + with contextlib.ExitStack() as ctx: + temp_dir = ctx.enter_context(_VerificationTempDir(dir=os.getcwd())) + validate_env = { + "SR_LOG_LEVEL": os.environ.get("SR_LOG_LEVEL", "INFO"), + "SR_LOG_FILE": os.environ.get( + "SR_LOG_FILE", os.path.join(temp_dir, "smartredis.log") + ), + } + if device == "GPU": + validate_env["CUDA_VISIBLE_DEVICES"] = "0" + ctx.enter_context(_env_vars_set_to(validate_env)) test_install( location=temp_dir, port=args.port, - device=args.device.upper(), + device=device, with_tf="tensorflow" in backends, with_pt="torch" in backends, with_onnx="onnxruntime" in backends, @@ -147,18 +157,40 @@ def test_install( logger.info("Verifying Tensor Transfer") client.put_tensor("plain-tensor", np.ones((1, 1, 3, 3))) client.get_tensor("plain-tensor") - if with_tf: - logger.info("Verifying TensorFlow Backend") - _test_tf_install(client, location, device) if with_pt: logger.info("Verifying Torch Backend") _test_torch_install(client, device) if with_onnx: logger.info("Verifying ONNX Backend") _test_onnx_install(client, device) + if with_tf: # Run last in case TF locks an entire GPU + logger.info("Verifying TensorFlow Backend") + _test_tf_install(client, location, device) + logger.info("Success!") -@contextmanager +@contextlib.contextmanager +def _env_vars_set_to( + evars: 
t.Mapping[str, t.Optional[str]] +) -> t.Generator[None, None, None]: + envvars = tuple((var, os.environ.pop(var, None), val) for var, val in evars.items()) + for var, _, tmpval in envvars: + _set_or_del_env_var(var, tmpval) + try: + yield + finally: + for var, origval, _ in reversed(envvars): + _set_or_del_env_var(var, origval) + + +def _set_or_del_env_var(var: str, val: t.Optional[str]) -> None: + if val is not None: + os.environ[var] = val + else: + os.environ.pop(var, None) + + +@contextlib.contextmanager def _make_managed_local_orc( exp: Experiment, port: int ) -> t.Generator[Client, None, None]: @@ -243,9 +275,18 @@ def __init__(self) -> None: def forward(self, x: torch.Tensor) -> torch.Tensor: return self.conv(x) + if device == "GPU": + device_ = torch.device("cuda") + else: + device_ = torch.device("cpu") + net = Net() - forward_input = torch.rand(1, 1, 3, 3) + net.to(device_) + net.eval() + + forward_input = torch.rand(1, 1, 3, 3).to(device_) traced = torch.jit.trace(net, forward_input) # type: ignore[no-untyped-call] + buffer = io.BytesIO() torch.jit.save(traced, buffer) # type: ignore[no-untyped-call] model = buffer.getvalue() @@ -261,7 +302,7 @@ def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None: from sklearn.cluster import KMeans data = np.arange(20, dtype=np.float32).reshape(10, 2) - model = KMeans(n_clusters=2) + model = KMeans(n_clusters=2, n_init=10) model.fit(data) kmeans = to_onnx(model, data, target_opset=11) diff --git a/smartsim/_core/_install/__init__.py b/smartsim/_core/_install/__init__.py index bf6fd954c7..efe03908e0 100644 --- a/smartsim/_core/_install/__init__.py +++ b/smartsim/_core/_install/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index eaa2c68bd9..85090ba0ae 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -193,23 +193,17 @@ class RedisAIVersion(Version_): defaults = { "1.2.7": { - "tensorflow": "2.8.0", - "onnx": "1.11.0", - "skl2onnx": "1.11.1", - "onnxmltools": "1.11.1", - "scikit-learn": "1.1.1", - "torch": "1.11.0", + "tensorflow": "2.13.1", + "onnx": "1.14.1", + "skl2onnx": "1.16.0", + "onnxmltools": "1.12.0", + "scikit-learn": "1.3.2", + "torch": "2.0.1", "torch_cpu_suffix": "+cpu", - "torch_cuda_suffix": "+cu113", - "torchvision": "0.12.0", + "torch_cuda_suffix": "+cu117", + "torchvision": "0.15.2", }, } - # Remove options with unsported wheels for python>=3.10 - if sys.version_info >= (3, 10): - defaults["1.2.7"].pop("onnx") - defaults["1.2.7"].pop("skl2onnx") - defaults["1.2.7"].pop("onnxmltools") - defaults["1.2.7"].pop("scikit-learn") def __init__(self, vers: str) -> None: # pylint: disable=super-init-not-called min_rai_version = min(Version_(ver) for ver in self.defaults) @@ -276,8 +270,8 @@ class Versioner: PYTHON_MIN = Version_("3.8.0") # Versions - SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.6.0")) - SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.0")) + SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.6.1")) + SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.1")) SMARTSIM_SUFFIX = get_env("SMARTSIM_SUFFIX", "") # Redis @@ -304,33 +298,19 @@ class Versioner: # TensorFlow and ONNX only use the defaults, but these are not built into # the RedisAI package and therefore the user is free to pick other versions. 
TENSORFLOW = Version_(REDISAI.tensorflow) - try: - ONNX = Version_(REDISAI.onnx) - except AttributeError: - ONNX = None - - def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Any]: - packages = [ - "SMARTSIM", - "SMARTREDIS", - db_name, - "REDISAI", - "TORCH", - "TENSORFLOW", - ] - versions = [ - self.SMARTSIM, - self.SMARTREDIS, - self.REDIS, - self.REDISAI, - self.TORCH, - self.TENSORFLOW, - ] - if self.ONNX: - packages.append("ONNX") - versions.append(self.ONNX) - vers = {"Packages": packages, "Versions": versions} - return vers + ONNX = Version_(REDISAI.onnx) + + def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]: + pkg_map = { + "SMARTSIM": self.SMARTSIM, + "SMARTREDIS": self.SMARTREDIS, + db_name: self.REDIS, + "REDISAI": self.REDISAI, + "TORCH": self.TORCH, + "TENSORFLOW": self.TENSORFLOW, + "ONNX": self.ONNX, + } + return {"Packages": tuple(pkg_map), "Versions": tuple(pkg_map.values())} def ml_extras_required(self) -> t.Dict[str, t.List[str]]: """Optional ML/DL dependencies we suggest for the user. diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index f96a9bb5fd..c098cfd010 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,25 +24,39 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# pylint: disable=too-many-lines + +import concurrent.futures +import enum +import itertools import os +import platform import re import shutil import stat import subprocess import sys +import tarfile +import tempfile import typing as t +import urllib.request +import zipfile +from abc import ABC, abstractmethod +from dataclasses import dataclass from pathlib import Path from shutil import which from subprocess import SubprocessError -# NOTE: This will be imported by setup.py and hence no -# smartsim related items should be imported into -# this file. +# NOTE: This will be imported by setup.py and hence no smartsim related +# items should be imported into this file. -# TODO: -# - check cmake version and use system if possible to avoid conflicts +# TODO: check cmake version and use system if possible to avoid conflicts TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime", "tflite"] +TDeviceStr = t.Literal["cpu", "gpu"] + +_T = t.TypeVar("_T") +_U = t.TypeVar("_U") def expand_exe_path(exe: str) -> str: @@ -69,6 +83,37 @@ class BuildError(Exception): pass +class Architecture(enum.Enum): + X64 = ("x86_64", "amd64") + ARM64 = ("arm64",) + + @classmethod + def from_str(cls, string: str, /) -> "Architecture": + string = string.lower() + for type_ in cls: + if string in type_.value: + return type_ + raise BuildError(f"Unrecognized or unsupported architecture: {string}") + + +class OperatingSystem(enum.Enum): + LINUX = ("linux", "linux2") + DARWIN = ("darwin",) + + @classmethod + def from_str(cls, string: str, /) -> "OperatingSystem": + string = string.lower() + for type_ in cls: + if string in type_.value: + return type_ + raise BuildError(f"Unrecognized or unsupported operating system: {string}") + + +class Platform(t.NamedTuple): + os: OperatingSystem + architecture: Architecture + + class Builder: """Base class for building third-party libraries""" @@ -83,10 +128,16 @@ class Builder: ) def __init__( - self, env: t.Dict[str, t.Any], jobs: t.Optional[int] = 
1, verbose: bool = False + self, + env: t.Dict[str, str], + jobs: int = 1, + _os: OperatingSystem = OperatingSystem.from_str(platform.system()), + architecture: Architecture = Architecture.from_str(platform.machine()), + verbose: bool = False, ) -> None: # build environment from buildenv self.env = env + self._platform = Platform(_os, architecture) # Find _core directory and set up paths _core_dir = Path(os.path.abspath(__file__)).parent.parent @@ -99,12 +150,7 @@ def __init__( self.bin_path = dependency_path / "bin" self.lib_path = dependency_path / "lib" - - # Set wether build process will output to std output - self.out: t.Optional[int] = subprocess.DEVNULL self.verbose = verbose - if self.verbose: - self.out = None # make build directory "SmartSim/smartsim/_core/.third-party" if not self.build_dir.is_dir(): @@ -117,12 +163,18 @@ def __init__( self.jobs = jobs + @property + def out(self) -> t.Optional[int]: + return None if self.verbose else subprocess.DEVNULL + # implemented in base classes @property def is_built(self) -> bool: raise NotImplementedError - def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None: + def build_from_git( + self, git_url: str, branch: str, device: TDeviceStr = "cpu" + ) -> None: raise NotImplementedError @staticmethod @@ -197,12 +249,20 @@ class DatabaseBuilder(Builder): def __init__( self, - build_env: t.Optional[t.Dict[str, t.Any]] = None, + build_env: t.Optional[t.Dict[str, str]] = None, malloc: str = "libc", - jobs: t.Optional[int] = None, + jobs: int = 1, + _os: OperatingSystem = OperatingSystem.from_str(platform.system()), + architecture: Architecture = Architecture.from_str(platform.machine()), verbose: bool = False, ) -> None: - super().__init__(build_env or {}, jobs=jobs, verbose=verbose) + super().__init__( + build_env or {}, + jobs=jobs, + _os=_os, + architecture=architecture, + verbose=verbose, + ) self.malloc = malloc @property @@ -213,7 +273,9 @@ def is_built(self) -> bool: keydb_files = 
{"keydb-server", "keydb-cli"} return redis_files.issubset(bin_files) or keydb_files.issubset(bin_files) - def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None: + def build_from_git( + self, git_url: str, branch: str, device: TDeviceStr = "cpu" + ) -> None: """Build Redis from git :param git_url: url from which to retrieve Redis :type git_url: str @@ -237,17 +299,21 @@ def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None if not self.is_valid_url(git_url): raise BuildError(f"Malformed {database_name} URL: {git_url}") + clone_cmd = config_git_command( + self._platform, + [ + self.binary_path("git"), + "clone", + git_url, + "--branch", + branch, + "--depth", + "1", + database_name, + ], + ) + # clone Redis - clone_cmd = [ - self.binary_path("git"), - "clone", - git_url, - "--branch", - branch, - "--depth", - "1", - database_name, - ] self.run_command(clone_cmd, cwd=self.build_dir) # build Redis @@ -288,6 +354,37 @@ def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None raise BuildError("Installation of redis-cli failed!") from e +class _RAIBuildDependency(ABC): + """An interface with a collection of magic methods so that + ``RedisAIBuilder`` can fetch and place its own dependencies + """ + + @property + @abstractmethod + def __rai_dependency_name__(self) -> str: ... + + @abstractmethod + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: ... + + @staticmethod + @abstractmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: ... 
+ + +def _place_rai_dep_at( + target: t.Union[str, "os.PathLike[str]"], verbose: bool +) -> t.Callable[[_RAIBuildDependency], Path]: + def _place(dep: _RAIBuildDependency) -> Path: + if verbose: + print(f"Placing: '{dep.__rai_dependency_name__}'") + path = dep.__place_for_rai__(target) + if verbose: + print(f"Placed: '{dep.__rai_dependency_name__}' at '{path}'") + return path + + return _place + + class RedisAIBuilder(Builder): """Class to build RedisAI from Source Supported build method: @@ -298,16 +395,25 @@ class RedisAIBuilder(Builder): def __init__( self, - build_env: t.Optional[t.Dict[str, t.Any]] = None, + _os: OperatingSystem = OperatingSystem.from_str(platform.system()), + architecture: Architecture = Architecture.from_str(platform.machine()), + build_env: t.Optional[t.Dict[str, str]] = None, torch_dir: str = "", libtf_dir: str = "", build_torch: bool = True, build_tf: bool = True, build_onnx: bool = False, - jobs: t.Optional[int] = None, + jobs: int = 1, verbose: bool = False, ) -> None: - super().__init__(build_env or {}, jobs=jobs, verbose=verbose) + super().__init__( + build_env or {}, + jobs=jobs, + _os=_os, + architecture=architecture, + verbose=verbose, + ) + self.rai_install_path: t.Optional[Path] = None # convert to int for RAI build script @@ -317,6 +423,29 @@ def __init__( self.libtf_dir = libtf_dir self.torch_dir = torch_dir + # Sanity checks + self._validate_platform() + + def _validate_platform(self) -> None: + unsupported = [] + if self._platform not in _DLPackRepository.supported_platforms(): + unsupported.append("DLPack") + if self.fetch_tf and (self._platform not in _TFArchive.supported_platforms()): + unsupported.append("Tensorflow") + if self.fetch_onnx and ( + self._platform not in _ORTArchive.supported_platforms() + ): + unsupported.append("ONNX") + if self.fetch_torch and ( + self._platform not in _PTArchive.supported_platforms() + ): + unsupported.append("PyTorch") + if unsupported: + raise BuildError( + f"The {', 
'.join(unsupported)} backend(s) are not supported " + f"on {self._platform.os} with {self._platform.architecture}" + ) + @property def rai_build_path(self) -> Path: return Path(self.build_dir, "RedisAI") @@ -351,6 +480,47 @@ def build_onnx(self) -> bool: def fetch_onnx(self) -> bool: return self.build_onnx + def get_deps_dir_path_for(self, device: TDeviceStr) -> Path: + def fail_to_format(reason: str) -> BuildError: # pragma: no cover + return BuildError(f"Failed to format RedisAI dependency path: {reason}") + + _os, architecture = self._platform + if _os == OperatingSystem.DARWIN: + os_ = "macos" + elif _os == OperatingSystem.LINUX: + os_ = "linux" + else: # pragma: no cover + raise fail_to_format(f"Unknown operating system: {_os}") + if architecture == Architecture.X64: + arch = "x64" + elif architecture == Architecture.ARM64: + arch = "arm64v8" + else: # pragma: no cover + raise fail_to_format(f"Unknown architecture: {architecture}") + return self.rai_build_path / f"deps/{os_}-{arch}-{device}" + + def _get_deps_to_fetch_for( + self, device: TDeviceStr + ) -> t.Tuple[_RAIBuildDependency, ...]: + os_, arch = self._platform + # TODO: It would be nice if the backend version numbers were declared + # alongside the python package version numbers so that all of the + # dependency versions were declared in single location. + # Unfortunately importing into this module is non-trivial as it + # is used as script in the SmartSim `setup.py`. + + # DLPack is always required + fetchable_deps: t.List[_RAIBuildDependency] = [_DLPackRepository("v0.5_RAI")] + if self.fetch_torch: + pt_dep = _choose_pt_variant(os_) + fetchable_deps.append(pt_dep(arch, device, "2.0.1")) + if self.fetch_tf: + fetchable_deps.append(_TFArchive(os_, arch, device, "2.13.1")) + if self.fetch_onnx: + fetchable_deps.append(_ORTArchive(os_, device, "1.16.3")) + + return tuple(fetchable_deps) + def symlink_libtf(self, device: str) -> None: """Add symbolic link to available libtensorflow in RedisAI deps. 
@@ -406,7 +576,9 @@ def symlink_libtf(self, device: str) -> None: if not dst_file.is_file(): os.symlink(src_file, dst_file) - def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None: + def build_from_git( + self, git_url: str, branch: str, device: TDeviceStr = "cpu" + ) -> None: """Build RedisAI from git :param git_url: url from which to retrieve RedisAI @@ -425,63 +597,24 @@ def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None raise BuildError(f"Malformed RedisAI URL: {git_url}") # clone RedisAI - clone_cmd = [ - self.binary_path("env"), - "GIT_LFS_SKIP_SMUDGE=1", - "git", - "clone", - "--recursive", - git_url, - ] - - checkout_osx_fix: t.List[str] = [] - - # Circumvent a bad `get_deps.sh` script from RAI on 1.2.7 with ONNX - # TODO: Look for a better way to do this or wait for RAI patch - if branch == "v1.2.7": - # Clone RAI patch commit for OSX - clone_cmd += ["RedisAI"] - checkout_osx_fix = [ + clone_cmd = config_git_command( + self._platform, + [ + self.binary_path("env"), + "GIT_LFS_SKIP_SMUDGE=1", "git", - "checkout", - "634916c722e718cc6ea3fad46e63f7d798f9adc2", - ] - else: - # Clone RAI release commit for versions > 1.2.7 - clone_cmd += [ + "clone", + "--recursive", + git_url, "--branch", branch, "--depth=1", - "RedisAI", - ] - - self.run_command(clone_cmd, out=subprocess.DEVNULL, cwd=self.build_dir) - if checkout_osx_fix: - self.run_command( - checkout_osx_fix, out=subprocess.DEVNULL, cwd=self.rai_build_path - ) - - # get RedisAI dependencies - dep_cmd = self._rai_build_env_prefix( - with_pt=self.build_torch, - with_tf=self.build_tf, - with_ort=self.build_onnx, - extra_env={"VERBOSE": "1"}, - ) - - dep_cmd.extend( - [ - self.binary_path("bash"), - str(self.rai_build_path / "get_deps.sh"), - str(device), - ] + os.fspath(self.rai_build_path), + ], ) - self.run_command( - dep_cmd, - out=subprocess.DEVNULL, # suppress this as it's not useful - cwd=self.rai_build_path, - ) + self.run_command(clone_cmd, 
out=subprocess.DEVNULL, cwd=self.build_dir) + self._fetch_deps_for(device) if self.libtf_dir and device: self.symlink_libtf(device) @@ -541,6 +674,25 @@ def _rai_build_env_prefix( *(f"{key}={val}" for key, val in extra_env.items()), ] + def _fetch_deps_for(self, device: TDeviceStr) -> None: + if not self.rai_build_path.is_dir(): + raise BuildError("RedisAI build directory not found") + + deps_dir = self.get_deps_dir_path_for(device) + deps_dir.mkdir(parents=True, exist_ok=True) + if any(deps_dir.iterdir()): + raise BuildError("RAI build dependency directory is not empty") + to_fetch = self._get_deps_to_fetch_for(device) + placed_paths = _threaded_map( + _place_rai_dep_at(deps_dir, self.verbose), to_fetch + ) + unique_placed_paths = {os.fspath(path.resolve()) for path in placed_paths} + if len(unique_placed_paths) != len(to_fetch): + raise BuildError( + f"Expected to place {len(to_fetch)} dependencies, but only " + f"found {len(unique_placed_paths)}" + ) + def _install_backends(self, device: str) -> None: """Move backend libraries to smartsim/_core/lib/ :param device: cpu or cpu @@ -578,3 +730,319 @@ def _move_torch_libs(self) -> None: if sys.platform == "darwin": dylibs = pip_torch_path / ".dylibs" self.copy_dir(dylibs, ss_rai_torch_path / ".dylibs", set_exe=True) + + +def _threaded_map(fn: t.Callable[[_T], _U], items: t.Iterable[_T]) -> t.Sequence[_U]: + items = tuple(items) + if not items: # No items so no work to do + return () + num_workers = min(len(items), (os.cpu_count() or 4) * 5) + with concurrent.futures.ThreadPoolExecutor(num_workers) as pool: + return tuple(pool.map(fn, items)) + + +class _WebLocation(ABC): + @property + @abstractmethod + def url(self) -> str: ... 
+ + +class _WebGitRepository(_WebLocation): + def clone( + self, + target: t.Union[str, "os.PathLike[str]"], + depth: t.Optional[int] = None, + branch: t.Optional[str] = None, + ) -> None: + depth_ = ("--depth", str(depth)) if depth is not None else () + branch_ = ("--branch", branch) if branch is not None else () + _git("clone", "-q", *depth_, *branch_, self.url, os.fspath(target)) + + +@t.final +@dataclass(frozen=True) +class _DLPackRepository(_WebGitRepository, _RAIBuildDependency): + version: str + + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ( + (OperatingSystem.LINUX, Architecture.X64), + (OperatingSystem.DARWIN, Architecture.X64), + (OperatingSystem.DARWIN, Architecture.ARM64), + ) + + @property + def url(self) -> str: + return "https://github.com/RedisAI/dlpack.git" + + @property + def __rai_dependency_name__(self) -> str: + return f"dlpack@{self.url}" + + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + target = Path(target) / "dlpack" + self.clone(target, branch=self.version, depth=1) + if not target.is_dir(): + raise BuildError("Failed to place dlpack") + return target + + +class _WebArchive(_WebLocation): + @property + def name(self) -> str: + _, name = self.url.rsplit("/", 1) + return name + + def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + target = Path(target) + if target.is_dir(): + target = target / self.name + file, _ = urllib.request.urlretrieve(self.url, target) + return Path(file).resolve() + + +class _ExtractableWebArchive(_WebArchive, ABC): + @abstractmethod + def _extract_download( + self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] + ) -> None: ... 
+ + def extract(self, target: t.Union[str, "os.PathLike[str]"]) -> None: + with tempfile.TemporaryDirectory() as tmp_dir: + arch_path = self.download(tmp_dir) + self._extract_download(arch_path, target) + + +class _WebTGZ(_ExtractableWebArchive): + def _extract_download( + self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] + ) -> None: + with tarfile.open(download_path, "r") as tgz_file: + tgz_file.extractall(target) + + +class _WebZip(_ExtractableWebArchive): + def _extract_download( + self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] + ) -> None: + with zipfile.ZipFile(download_path, "r") as zip_file: + zip_file.extractall(target) + + +@dataclass(frozen=True) +class _PTArchive(_WebZip, _RAIBuildDependency): + architecture: Architecture + device: TDeviceStr + version: str + + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + # TODO: This will need to be revisited if the inheritance tree gets deeper + return tuple( + itertools.chain.from_iterable( + var.supported_platforms() for var in _PTArchive.__subclasses__() + ) + ) + + @property + def __rai_dependency_name__(self) -> str: + return f"libtorch@{self.url}" + + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + self.extract(target) + target = Path(target) / "libtorch" + if not target.is_dir(): + raise BuildError("Failed to place RAI dependency: `libtorch`") + return target + + +@t.final +class _PTArchiveLinux(_PTArchive): + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ((OperatingSystem.LINUX, Architecture.X64),) + + @property + def url(self) -> str: + if self.device == "gpu": + pt_build = "cu117" + else: + pt_build = "cpu" + # pylint: disable-next=line-too-long + libtorch_archive = ( + f"libtorch-cxx11-abi-shared-without-deps-{self.version}%2B{pt_build}.zip" + ) + return f"https://download.pytorch.org/libtorch/{pt_build}/{libtorch_archive}" 
+ + +@t.final +class _PTArchiveMacOSX(_PTArchive): + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ( + (OperatingSystem.DARWIN, Architecture.ARM64), + (OperatingSystem.DARWIN, Architecture.X64), + ) + + @property + def url(self) -> str: + if self.device == "gpu": + raise BuildError("RedisAI does not currently support GPU on Mac OSX") + if self.architecture == Architecture.X64: + pt_build = "cpu" + libtorch_archive = f"libtorch-macos-{self.version}.zip" + root_url = "https://download.pytorch.org/libtorch" + return f"{root_url}/{pt_build}/{libtorch_archive}" + if self.architecture == Architecture.ARM64: + libtorch_archive = f"libtorch-macos-arm64-{self.version}.zip" + # pylint: disable-next=line-too-long + root_url = ( + "https://github.com/CrayLabs/ml_lib_builder/releases/download/v0.1/" + ) + return f"{root_url}/{libtorch_archive}" + + raise BuildError("Unsupported architecture for Pytorch: {self.architecture}") + + +def _choose_pt_variant( + os_: OperatingSystem, +) -> t.Union[t.Type[_PTArchiveLinux], t.Type[_PTArchiveMacOSX]]: + if os_ == OperatingSystem.DARWIN: + return _PTArchiveMacOSX + if os_ == OperatingSystem.LINUX: + return _PTArchiveLinux + + raise BuildError(f"Unsupported OS for PyTorch: {os_}") + + +@t.final +@dataclass(frozen=True) +class _TFArchive(_WebTGZ, _RAIBuildDependency): + os_: OperatingSystem + architecture: Architecture + device: TDeviceStr + version: str + + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ( + (OperatingSystem.LINUX, Architecture.X64), + (OperatingSystem.DARWIN, Architecture.X64), + ) + + @property + def url(self) -> str: + if self.architecture == Architecture.X64: + tf_arch = "x86_64" + else: + raise BuildError( + "Unexpected Architecture for TF Archive: {self.architecture}" + ) + + if self.os_ == OperatingSystem.LINUX: + tf_os = "linux" + tf_device = self.device + elif self.os_ == OperatingSystem.DARWIN: + 
tf_os = "darwin" + if self.device == "gpu": + raise BuildError("RedisAI does not currently support GPU on Macos") + tf_device = "cpu" + else: + raise BuildError("Unexpected OS for TF Archive: {self.os_}") + return ( + "https://storage.googleapis.com/tensorflow/libtensorflow/" + f"libtensorflow-{tf_device}-{tf_os}-{tf_arch}-{self.version}.tar.gz" + ) + + @property + def __rai_dependency_name__(self) -> str: + return f"libtensorflow@{self.url}" + + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + target = Path(target) / "libtensorflow" + target.mkdir() + self.extract(target) + return target + + +@t.final +@dataclass(frozen=True) +class _ORTArchive(_WebTGZ, _RAIBuildDependency): + os_: OperatingSystem + device: TDeviceStr + version: str + + @staticmethod + def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: + return ( + (OperatingSystem.LINUX, Architecture.X64), + (OperatingSystem.DARWIN, Architecture.X64), + ) + + @property + def url(self) -> str: + ort_url_base = ( + "https://github.com/microsoft/onnxruntime/releases/" + f"download/v{self.version}" + ) + if self.os_ == OperatingSystem.LINUX: + ort_os = "linux" + ort_arch = "x64" + ort_build = "-gpu" if self.device == "gpu" else "" + elif self.os_ == OperatingSystem.DARWIN: + ort_os = "osx" + ort_arch = "x86_64" + ort_build = "" + if self.device == "gpu": + raise BuildError("RedisAI does not currently support GPU on Macos") + else: + raise BuildError("Unexpected OS for TF Archive: {self.os_}") + ort_archive = f"onnxruntime-{ort_os}-{ort_arch}{ort_build}-{self.version}.tgz" + return f"{ort_url_base}/{ort_archive}" + + @property + def __rai_dependency_name__(self) -> str: + return f"onnxruntime@{self.url}" + + def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + target = Path(target).resolve() / "onnxruntime" + self.extract(target) + try: + (extracted_dir,) = target.iterdir() + except ValueError: + raise BuildError( + "Unexpected 
number of files extracted from ORT archive" + ) from None + for file in extracted_dir.iterdir(): + file.rename(target / file.name) + extracted_dir.rmdir() + return target + + +def _git(*args: str) -> None: + git = Builder.binary_path("git") + cmd = (git,) + args + with subprocess.Popen(cmd) as proc: + proc.wait() + if proc.returncode != 0: + raise BuildError( + f"Command `{' '.join(cmd)}` failed with exit code {proc.returncode}" + ) + + +def config_git_command(plat: Platform, cmd: t.Sequence[str]) -> t.List[str]: + """Modify git commands to include autocrlf when on a platform that needs + autocrlf enabled to behave correctly + """ + cmd = list(cmd) + where = next((i for i, tok in enumerate(cmd) if tok.endswith("git")), len(cmd)) + 2 + if where >= len(cmd): + raise ValueError(f"Failed to locate git command in '{' '.join(cmd)}'") + if plat == Platform(OperatingSystem.DARWIN, Architecture.ARM64): + cmd = ( + cmd[:where] + + ["--config", "core.autocrlf=false", "--config", "core.eol=lf"] + + cmd[where:] + ) + return cmd diff --git a/smartsim/_core/config/__init__.py b/smartsim/_core/config/__init__.py index 97e3caf189..1637d6a2f3 100644 --- a/smartsim/_core/config/__init__.py +++ b/smartsim/_core/config/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index af5ebe5084..42a548c427 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -225,12 +225,16 @@ def telemetry_frequency(self) -> int: @property def telemetry_enabled(self) -> bool: - return int(os.environ.get("SMARTSIM_FLAG_TELEMETRY", "0")) > 0 + return int(os.environ.get("SMARTSIM_FLAG_TELEMETRY", "1")) > 0 @property def telemetry_cooldown(self) -> int: return int(os.environ.get("SMARTSIM_TELEMETRY_COOLDOWN", 90)) + @property + def telemetry_subdir(self) -> str: + return ".smartsim/telemetry" + @lru_cache(maxsize=128, typed=False) def get_config() -> Config: diff --git a/smartsim/_core/control/__init__.py b/smartsim/_core/control/__init__.py index 2a89c04b5c..0acd80650c 100644 --- a/smartsim/_core/control/__init__.py +++ b/smartsim/_core/control/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index e3e463c51e..3b673970a6 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -63,13 +63,7 @@ from ...servertype import CLUSTERED, STANDALONE from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES from ..config import CONFIG -from ..launcher import ( - CobaltLauncher, - LocalLauncher, - LSFLauncher, - PBSLauncher, - SlurmLauncher, -) +from ..launcher import LocalLauncher, LSFLauncher, PBSLauncher, SlurmLauncher from ..launcher.launcher import Launcher from ..utils import check_cluster_status, create_cluster, serialize from .job import Job @@ -318,7 +312,7 @@ def get_entity_list_status( def init_launcher(self, launcher: str) -> None: """Initialize the controller with a specific type of launcher. - SmartSim currently supports slurm, pbs(pro), cobalt, lsf, + SmartSim currently supports slurm, pbs(pro), lsf, and local launching :param launcher: which launcher to initialize @@ -331,7 +325,6 @@ def init_launcher(self, launcher: str) -> None: "slurm": SlurmLauncher, "pbs": PBSLauncher, "pals": PBSLauncher, - "cobalt": CobaltLauncher, "lsf": LSFLauncher, "local": LocalLauncher, } @@ -631,7 +624,7 @@ def _prep_entity_client_env(self, entity: Model) -> None: # Set address to local if it's a colocated model if entity.colocated and entity.run_settings.colocated_db_settings is not None: db_name_colo = entity.run_settings.colocated_db_settings["db_identifier"] - + assert isinstance(db_name_colo, str) for key in address_dict: _, db_id = unpack_db_identifier(key, "_") if db_name_colo == db_id: @@ -842,11 +835,11 @@ def _start_telemetry_monitor(self, exp_dir: str) -> None: :param exp_dir: An experiment directory :type exp_dir: str """ - logger.debug("Starting telemetry monitor process") if ( self._telemetry_monitor is None or self._telemetry_monitor.returncode is not None ): + logger.debug("Starting telemetry monitor process") cmd = [ sys.executable, "-m", @@ -866,6 +859,7 @@ def _start_telemetry_monitor(self, exp_dir: str) -> None: 
cwd=str(pathlib.Path(__file__).parent.parent.parent), shell=False, ) + logger.debug("Telemetry monitor started") class _AnonymousBatchJob(EntityList[Model]): diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index aa4ecce76c..f3bd8cf3aa 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index d23543030f..e482b9951a 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +34,7 @@ from ...database import Orchestrator from ...entity import DBNode, EntitySequence, SmartSimEntity -from ...log import get_logger +from ...log import ContextThread, get_logger from ...status import STATUS_NEVER_STARTED, TERMINAL_STATUSES from ..config import CONFIG from ..launcher import Launcher, LocalLauncher @@ -80,7 +80,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None: def start(self) -> None: """Start a thread for the job manager""" - self.monitor = Thread(name="JobManager", daemon=True, target=self.run) + self.monitor = ContextThread(name="JobManager", daemon=True, target=self.run) self.monitor.start() def run(self) -> None: diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 62ab013e58..25037540c1 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 
2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import itertools import pathlib import typing as t from dataclasses import dataclass, field @@ -31,6 +32,7 @@ from ...database import Orchestrator from ...entity import DBNode, Ensemble, EntitySequence, Model, SmartSimEntity from ...error import SmartSimError +from ..config import CONFIG from ..utils import helpers as _helpers from ..utils import serialize as _serialize @@ -177,52 +179,12 @@ def __str__(self) -> str: @property def has_db_objects(self) -> bool: """Check if any entity has DBObjects to set""" - - def has_db_models( - entity: t.Union[EntitySequence[SmartSimEntity], Model] - ) -> bool: - return len(list(entity.db_models)) > 0 - - def has_db_scripts( - entity: t.Union[EntitySequence[SmartSimEntity], Model] - ) -> bool: - return len(list(entity.db_scripts)) > 0 - - has_db_objects = False - - # Check if any model has either a DBModel or a DBScript - # we update has_db_objects so that as soon as one check - # returns True, we can exit - has_db_objects |= any( - has_db_models(model) | has_db_scripts(model) for model in self.models + ents: t.Iterable[t.Union[Model, Ensemble]] = itertools.chain( + self.models, + self.ensembles, + (member for ens in self.ensembles for member in ens.entities), ) - if has_db_objects: - return True - - # If there are no ensembles, there can be no outstanding model - # to check for DBObjects, return current value of DBObjects, which - # should be False - ensembles = self.ensembles - if not ensembles: - return has_db_objects - - # First check if there is any ensemble DBObject, if so, return True - has_db_objects |= any( - has_db_models(ensemble) | has_db_scripts(ensemble) for 
ensemble in ensembles - ) - if has_db_objects: - return True - for ensemble in ensembles: - # Last case, check if any model within an ensemble has DBObjects attached - has_db_objects |= any( - has_db_models(model) | has_db_scripts(model) - for model in ensemble.models - ) - if has_db_objects: - return True - - # `has_db_objects` should be False here - return has_db_objects + return any(any(ent.db_models) or any(ent.db_scripts) for ent in ents) class _LaunchedManifestMetadata(t.NamedTuple): @@ -343,7 +305,7 @@ def finalize(self) -> LaunchedManifest[_T]: def _format_exp_telemetry_path( exp_path: t.Union[str, "os.PathLike[str]"] ) -> pathlib.Path: - return pathlib.Path(exp_path, _serialize.TELMON_SUBDIR) + return pathlib.Path(exp_path, CONFIG.telemetry_subdir) def _format_run_telemetry_path( diff --git a/smartsim/_core/entrypoints/__init__.py b/smartsim/_core/entrypoints/__init__.py index bf6fd954c7..efe03908e0 100644 --- a/smartsim/_core/entrypoints/__init__.py +++ b/smartsim/_core/entrypoints/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 332d6e0198..600ae2ff31 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py index 18d27601ff..f94ad6e61b 100644 --- a/smartsim/_core/entrypoints/indirect.py +++ b/smartsim/_core/entrypoints/indirect.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index ef9911829d..018fc26fd4 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py index 86d6fe72fb..115528bf42 100644 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -49,7 +49,6 @@ from smartsim._core.config import CONFIG from smartsim._core.control.job import JobEntity, _JobKey from smartsim._core.control.jobmanager import JobManager -from smartsim._core.launcher.cobalt.cobaltLauncher import CobaltLauncher from smartsim._core.launcher.launcher import Launcher from smartsim._core.launcher.local.local import LocalLauncher from smartsim._core.launcher.lsf.lsfLauncher import LSFLauncher @@ -57,7 +56,7 @@ from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher from smartsim._core.launcher.stepInfo import StepInfo from smartsim._core.utils.helpers import get_ts -from smartsim._core.utils.serialize import MANIFEST_FILENAME, TELMON_SUBDIR +from smartsim._core.utils.serialize import MANIFEST_FILENAME from smartsim.error.errors import SmartSimError from smartsim.status import STATUS_COMPLETED, TERMINAL_STATUSES @@ -325,14 +324,13 @@ def __init__( self._launcher_map: t.Dict[str, t.Type[Launcher]] = { "slurm": SlurmLauncher, "pbs": PBSLauncher, - "cobalt": CobaltLauncher, "lsf": LSFLauncher, "local": LocalLauncher, } def init_launcher(self, launcher: str) -> Launcher: """Initialize the controller with a specific type of launcher. 
- SmartSim currently supports slurm, pbs(pro), cobalt, lsf, + SmartSim currently supports slurm, pbs(pro), lsf, and local launching :param launcher: which launcher to initialize @@ -582,7 +580,7 @@ def main( poll for new jobs before attempting to shutdown :type cooldown_duration: int """ - manifest_relpath = pathlib.Path(TELMON_SUBDIR) / MANIFEST_FILENAME + manifest_relpath = pathlib.Path(CONFIG.telemetry_subdir) / MANIFEST_FILENAME manifest_path = experiment_dir / manifest_relpath monitor_pattern = str(manifest_relpath) @@ -667,7 +665,9 @@ def get_parser() -> argparse.ArgumentParser: log.setLevel(logging.DEBUG) log.propagate = False - log_path = os.path.join(args.exp_dir, TELMON_SUBDIR, "telemetrymonitor.log") + log_path = os.path.join( + args.exp_dir, CONFIG.telemetry_subdir, "telemetrymonitor.log" + ) fh = logging.FileHandler(log_path, "a") log.addHandler(fh) diff --git a/smartsim/_core/generation/__init__.py b/smartsim/_core/generation/__init__.py index 10470e2d5e..5224f84983 100644 --- a/smartsim/_core/generation/__init__.py +++ b/smartsim/_core/generation/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index 79cea06b7b..502753df7f 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/generation/modelwriter.py b/smartsim/_core/generation/modelwriter.py index 0cf071082e..3062ea1db5 100644 --- a/smartsim/_core/generation/modelwriter.py +++ b/smartsim/_core/generation/modelwriter.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/__init__.py b/smartsim/_core/launcher/__init__.py index 6e1aa724e5..0c4001cd48 100644 --- a/smartsim/_core/launcher/__init__.py +++ b/smartsim/_core/launcher/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from .cobalt.cobaltLauncher import CobaltLauncher from .launcher import Launcher from .local.local import LocalLauncher from .lsf.lsfLauncher import LSFLauncher @@ -33,7 +32,6 @@ __all__ = [ "Launcher", - "CobaltLauncher", "LocalLauncher", "LSFLauncher", "PBSLauncher", diff --git a/smartsim/_core/launcher/cobalt/__init__.py b/smartsim/_core/launcher/cobalt/__init__.py deleted file mode 100644 index bf6fd954c7..0000000000 --- a/smartsim/_core/launcher/cobalt/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. 
Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/launcher/cobalt/cobaltCommands.py b/smartsim/_core/launcher/cobalt/cobaltCommands.py deleted file mode 100644 index bf6fd954c7..0000000000 --- a/smartsim/_core/launcher/cobalt/cobaltCommands.py +++ /dev/null @@ -1,25 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/launcher/cobalt/cobaltLauncher.py b/smartsim/_core/launcher/cobalt/cobaltLauncher.py deleted file mode 100644 index 56ebe12cc2..0000000000 --- a/smartsim/_core/launcher/cobalt/cobaltLauncher.py +++ /dev/null @@ -1,207 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import time -import typing as t - -import psutil - -from smartsim._core.launcher.step import Step -from smartsim.settings import ( - AprunSettings, - CobaltBatchSettings, - MpiexecSettings, - MpirunSettings, - OrterunSettings, - RunSettings, - SettingsBase, -) - -from ....error import LauncherError -from ....log import get_logger -from ....status import STATUS_CANCELLED, STATUS_COMPLETED -from ...config import CONFIG -from ..launcher import WLMLauncher -from ..pbs.pbsCommands import qdel, qstat -from ..step import ( - AprunStep, - CobaltBatchStep, - LocalStep, - MpiexecStep, - MpirunStep, - OrterunStep, - Step, -) -from ..stepInfo import CobaltStepInfo, StepInfo -from .cobaltParser import parse_cobalt_step_id, parse_cobalt_step_status, parse_qsub_out - -logger = get_logger(__name__) - - -class CobaltLauncher(WLMLauncher): - """This class encapsulates the functionality needed - to launch jobs on systems that use Cobalt as a workload manager. - - All WLM launchers are capable of launching managed and unmanaged - jobs. Managed jobs are queried through interaction with with WLM, - in this case Cobalt. 
Unmanaged jobs are held in the TaskManager - and are managed through references to their launching process ID - i.e. a psutil.Popen object - """ - - def __init__(self) -> None: - super().__init__() - self.user = psutil.Process().username() - - @property - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: - # RunSettings types supported by this launcher - return { - AprunSettings: AprunStep, - CobaltBatchSettings: CobaltBatchStep, - MpirunSettings: MpirunStep, - MpiexecSettings: MpiexecStep, - OrterunSettings: OrterunStep, - RunSettings: LocalStep, - } - - def run(self, step: Step) -> t.Optional[str]: - """Run a job step through Cobalt - - :param step: a job step instance - :type step: Step - :raises LauncherError: if launch fails - :return: job step id if job is managed - :rtype: str - """ - if not self.task_manager.actively_monitoring: - self.task_manager.start() - - cmd_list = step.get_launch_cmd() - step_id = None - task_id = None - if isinstance(step, CobaltBatchStep): - # wait for batch step to submit successfully - return_code, out, err = self.task_manager.start_and_wait(cmd_list, step.cwd) - if return_code != 0: - raise LauncherError( - f"Cobalt qsub batch submission failed\n {out}\n {err}" - ) - if out: - step_id = parse_qsub_out(out) - logger.debug(f"Gleaned batch job id: {step_id} for {step.name}") - else: - # aprun doesn't direct output for us. 
- out, err = step.get_output_files() - - # pylint: disable-next=consider-using-with - output = open(out, "w+", encoding="utf-8") - # pylint: disable-next=consider-using-with - error = open(err, "w+", encoding="utf-8") - - task_id = self.task_manager.start_task( - cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() - ) - - # if batch submission did not successfully retrieve job ID - if not step_id and step.managed: - step_id = self._get_cobalt_step_id(step) - - self.step_mapping.add(step.name, step_id, task_id, step.managed) - return step_id - - def stop(self, step_name: str) -> StepInfo: - """Step a job step - - :param step_name: name of the job to stop - :type step_name: str - :return: update for job due to cancel - :rtype: StepInfo - """ - stepmap = self.step_mapping[step_name] - if stepmap.managed: - qdel_rc, _, err = qdel([str(stepmap.step_id)]) - if qdel_rc != 0: - logger.warning(f"Unable to cancel job step {step_name}\n {err}") - if stepmap.task_id: - self.task_manager.remove_task(str(stepmap.task_id)) - else: - if stepmap.task_id: - self.task_manager.remove_task(str(stepmap.task_id)) - - _, step_info = self.get_step_update([step_name])[0] - if not step_info: - raise LauncherError(f"Could not get step_info for job step {step_name}") - step_info.status = STATUS_CANCELLED # set status to cancelled instead of failed - return step_info - - def _get_cobalt_step_id(self, step: Step, interval: int = 2) -> str: - """Get the step_id of a step from qstat (rarely used) - - Parses cobalt qstat output by looking for the step name - """ - step_id = None - trials = CONFIG.wlm_trials - while trials > 0: - output, _ = qstat(["--header", "JobName:JobId", "-u", self.user]) - step_id = parse_cobalt_step_id(output, step.name) - if step_id: - break - else: - time.sleep(interval) - trials -= 1 - if not step_id: - raise LauncherError("Could not find id of launched job step") - return step_id - - def _get_managed_step_update(self, step_ids: t.List[str]) -> 
t.List[StepInfo]: - """Get step updates for WLM managed jobs - - :param step_ids: list of job step ids - :type step_ids: list[str] - :return: list of updates for managed jobs - :rtype: list[StepInfo] - """ - args = ["--header", "JobId:State", "-u", self.user] - args.extend(step_ids) - qstat_out, _ = qstat(args) - - stats = [ - parse_cobalt_step_status(qstat_out, str(step_id)) for step_id in step_ids - ] - # create CobaltStepInfo objects to return - updates: t.List[StepInfo] = [] - for stat, _ in zip(stats, step_ids): - info = CobaltStepInfo(stat, None) # returncode not logged by Cobalt - - if info.status == STATUS_COMPLETED: - info.returncode = 0 - - updates.append(info) - return updates - - def __str__(self) -> str: - return "Cobalt" diff --git a/smartsim/_core/launcher/cobalt/cobaltParser.py b/smartsim/_core/launcher/cobalt/cobaltParser.py deleted file mode 100644 index c76509d369..0000000000 --- a/smartsim/_core/launcher/cobalt/cobaltParser.py +++ /dev/null @@ -1,86 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -def parse_cobalt_step_status(output: str, step_id: str) -> str: - """ - Parse and return the status of a cobalt step - - :param output: output qstat - :type output: str - :param step_id: the id of the step to query - :type step_id: str - :rtype: str - """ - status = "NOTFOUND" - for line in output.split("\n"): - fields = line.split() - if len(fields) >= 2: - if fields[0] == step_id: - status = fields[1] - break - return status - - -def parse_cobalt_step_id(output: str, step_name: str) -> str: - """Parse and return the step id from a cobalt qstat command - - :param output: output qstat - :type output: str - :param step_name: the name of the step to query - :type step_name: str - :return: the step_id - :rtype: str - """ - step_id = "" - for line in output.split("\n"): - fields = line.split() - if len(fields) >= 2: - if fields[0] == step_name: - step_id = fields[1] - break - return step_id - - -def parse_qsub_out(output: str) -> str: - """ - Parse and return the step id from a cobalt qsub command - - :param output: output qstat - :type output: str - :return: the step_id - :rtype: str - """ - step_id = "" - for line in output.split("\n"): - try: - value = line.strip() - int(value) # if the cast works, return original string - step_id = value - break - except ValueError: - continue - return step_id diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index ea331023c8..11d26b141e 100644 --- 
a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index 61f0460f96..80000c22f9 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/local/__init__.py b/smartsim/_core/launcher/local/__init__.py index bf6fd954c7..efe03908e0 100644 --- a/smartsim/_core/launcher/local/__init__.py +++ b/smartsim/_core/launcher/local/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index fee058d166..96778ec0dd 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/lsf/__init__.py b/smartsim/_core/launcher/lsf/__init__.py index bf6fd954c7..efe03908e0 100644 --- a/smartsim/_core/launcher/lsf/__init__.py +++ b/smartsim/_core/launcher/lsf/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/lsf/lsfCommands.py b/smartsim/_core/launcher/lsf/lsfCommands.py index 99836fa7af..d6d0ee031a 100644 --- a/smartsim/_core/launcher/lsf/lsfCommands.py +++ b/smartsim/_core/launcher/lsf/lsfCommands.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/lsf/lsfLauncher.py b/smartsim/_core/launcher/lsf/lsfLauncher.py index c2f432807d..a8b6fafdbd 100644 --- a/smartsim/_core/launcher/lsf/lsfLauncher.py +++ b/smartsim/_core/launcher/lsf/lsfLauncher.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/lsf/lsfParser.py b/smartsim/_core/launcher/lsf/lsfParser.py index fff49c57ef..33837d2bda 100644 --- a/smartsim/_core/launcher/lsf/lsfParser.py +++ b/smartsim/_core/launcher/lsf/lsfParser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/pbs/__init__.py b/smartsim/_core/launcher/pbs/__init__.py index bf6fd954c7..efe03908e0 100644 --- a/smartsim/_core/launcher/pbs/__init__.py +++ b/smartsim/_core/launcher/pbs/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/pbs/pbsCommands.py b/smartsim/_core/launcher/pbs/pbsCommands.py index 0fdd06f7b3..f738ef1f85 100644 --- a/smartsim/_core/launcher/pbs/pbsCommands.py +++ b/smartsim/_core/launcher/pbs/pbsCommands.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index 1b77ffd811..0b2f85e951 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/pbs/pbsParser.py b/smartsim/_core/launcher/pbs/pbsParser.py index 426166342e..362577595c 100644 --- a/smartsim/_core/launcher/pbs/pbsParser.py +++ b/smartsim/_core/launcher/pbs/pbsParser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/slurm/__init__.py b/smartsim/_core/launcher/slurm/__init__.py index bf6fd954c7..efe03908e0 100644 --- a/smartsim/_core/launcher/slurm/__init__.py +++ b/smartsim/_core/launcher/slurm/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/slurm/slurmCommands.py b/smartsim/_core/launcher/slurm/slurmCommands.py index ecf545b918..2e37f1d79e 100644 --- a/smartsim/_core/launcher/slurm/slurmCommands.py +++ b/smartsim/_core/launcher/slurm/slurmCommands.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index cba8df4f17..e939a63db7 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/slurm/slurmParser.py b/smartsim/_core/launcher/slurm/slurmParser.py index bfea06efb5..ede687eb63 100644 --- a/smartsim/_core/launcher/slurm/slurmParser.py +++ b/smartsim/_core/launcher/slurm/slurmParser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/__init__.py b/smartsim/_core/launcher/step/__init__.py index 98dd1a9215..663edb6827 100644 --- a/smartsim/_core/launcher/step/__init__.py +++ b/smartsim/_core/launcher/step/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from .alpsStep import AprunStep -from .cobaltStep import CobaltBatchStep from .localStep import LocalStep from .lsfStep import BsubBatchStep, JsrunStep from .mpiStep import MpiexecStep, MpirunStep, OrterunStep diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index d675f703f1..61ca5eee8d 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -113,12 +113,6 @@ def _set_alloc(self) -> None: logger.debug( f"Running on PBS allocation {self.alloc} gleaned from user environment" ) - elif "COBALT_JOBID" in os.environ: - self.alloc = os.environ["COBALT_JOBID"] - logger.debug( - f"Running on Cobalt allocation {self.alloc} gleaned " - "from user environment" - ) else: raise AllocationError( "No allocation specified or found and not running in batch" diff --git a/smartsim/_core/launcher/step/cobaltStep.py b/smartsim/_core/launcher/step/cobaltStep.py deleted file mode 100644 index b224121e25..0000000000 --- a/smartsim/_core/launcher/step/cobaltStep.py +++ /dev/null @@ -1,106 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -import stat -import typing as t - -from ....log import get_logger -from ....settings import CobaltBatchSettings -from .step import Step - -logger = get_logger(__name__) - - -class CobaltBatchStep(Step): - def __init__( - self, name: str, cwd: str, batch_settings: CobaltBatchSettings - ) -> None: - """Initialize a Cobalt qsub step - - :param name: name of the entity to launch - :type name: str - :param cwd: path to launch dir - :type cwd: str - :param batch_settings: batch settings for entity - :type batch_settings: CobaltBatchSettings - """ - super().__init__(name, cwd, batch_settings) - self.step_cmds: t.List[t.List[str]] = [] - self.managed = True - self.batch_settings = batch_settings - - def get_launch_cmd(self) -> t.List[str]: - """Get the launch command for the batch - - :return: launch command for the batch - :rtype: list[str] - """ - script = self._write_script() - return [self.batch_settings.batch_cmd, script] - - def add_to_batch(self, step: Step) -> None: - """Add a job step to this batch - - :param step: a job step instance e.g. 
SrunStep - :type step: Step - """ - launch_cmd = step.get_launch_cmd() - self.step_cmds.append(launch_cmd) - logger.debug(f"Added step command to batch for {step.name}") - - def _write_script(self) -> str: - """Write the batch script - - :return: batch script path after writing - :rtype: str - """ - batch_script = self.get_step_file(ending=".sh") - cobalt_debug = self.get_step_file(ending=".cobalt-debug") - output, error = self.get_output_files() - with open(batch_script, "w", encoding="utf-8") as script_file: - script_file.write("#!/bin/bash\n") - script_file.write(f"#COBALT -o {output}\n") - script_file.write(f"#COBALT -e {error}\n") - script_file.write(f"#COBALT --cwd {self.cwd}\n") - script_file.write(f"#COBALT --jobname {self.name}\n") - script_file.write(f"#COBALT --debuglog {cobalt_debug}\n") - - # add additional sbatch options - for opt in self.batch_settings.format_batch_args(): - script_file.write(f"#COBALT {opt}\n") - - for cmd in self.batch_settings.preamble: - script_file.write(f"{cmd}\n") - - for i, step_cmd in enumerate(self.step_cmds): - script_file.write("\n") - script_file.write(f"{' '.join((step_cmd))} &\n") - if i == len(self.step_cmds) - 1: - script_file.write("\n") - script_file.write("wait\n") - os.chmod(batch_script, stat.S_IXUSR | stat.S_IWUSR | stat.S_IRUSR) - return batch_script diff --git a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index 2f10bc79d5..968152a412 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index 953ab9c452..1c88dadb86 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 7971fb7325..785d55e92b 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -57,7 +57,7 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings) -> None: self._set_alloc() self.run_settings = run_settings - _supported_launchers = ["PBS", "COBALT", "SLURM", "LSB"] + _supported_launchers = ["PBS", "SLURM", "LSB"] @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: diff --git a/smartsim/_core/launcher/step/pbsStep.py b/smartsim/_core/launcher/step/pbsStep.py index 9218894f9b..65dac3225c 100644 --- a/smartsim/_core/launcher/step/pbsStep.py +++ b/smartsim/_core/launcher/step/pbsStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index cb0db483bb..7baab891b1 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index ebbdd074e5..ddb95a8506 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -93,7 +93,7 @@ def get_colocated_launch_script(self) -> str: ) makedirs(osp.dirname(script_path), exist_ok=True) - db_settings: t.Dict[str, str] = {} + db_settings = {} if isinstance(self.step_settings, RunSettings): db_settings = self.step_settings.colocated_db_settings or {} diff --git a/smartsim/_core/launcher/stepInfo.py b/smartsim/_core/launcher/stepInfo.py index b33dac5ecf..56b5218fc8 100644 --- a/smartsim/_core/launcher/stepInfo.py +++ b/smartsim/_core/launcher/stepInfo.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -195,42 +195,6 @@ def __init__( ) -class CobaltStepInfo(StepInfo): # cov-cobalt - @property - def mapping(self) -> t.Dict[str, str]: - return { - "running": STATUS_RUNNING, - "queued": STATUS_PAUSED, - "starting": STATUS_PAUSED, - "dep_hold": STATUS_PAUSED, - "user_hold": STATUS_PAUSED, - "admin_hold": STATUS_PAUSED, - "dep_fail": STATUS_FAILED, # unsure of this one - "terminating": STATUS_COMPLETED, - "killing": STATUS_COMPLETED, - "exiting": STATUS_COMPLETED, - } - - def __init__( - self, - status: str = "", - returncode: t.Optional[int] = None, - output: t.Optional[str] = None, - error: t.Optional[str] = None, - ) -> None: - if status == "NOTFOUND": - # returncode not logged by Cobalt - # if job has exited the queue then we consider it "completed" - # this should only be hit in the case when job exits abnormally fast - smartsim_status = "Completed" - returncode = 0 - else: - smartsim_status = self._get_smartsim_status(status) - super().__init__( - smartsim_status, status, returncode, output=output, error=error - ) - - class LSFBatchStepInfo(StepInfo): # cov-lsf @property def mapping(self) -> t.Dict[str, str]: diff --git a/smartsim/_core/launcher/stepMapping.py b/smartsim/_core/launcher/stepMapping.py index 665404b1b0..15c93470f3 100644 --- a/smartsim/_core/launcher/stepMapping.py +++ b/smartsim/_core/launcher/stepMapping.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/taskManager.py b/smartsim/_core/launcher/taskManager.py index 2ad84493f2..84123944ee 100644 --- a/smartsim/_core/launcher/taskManager.py +++ b/smartsim/_core/launcher/taskManager.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,12 +29,12 @@ import time import typing as t from subprocess import PIPE -from threading import RLock, Thread +from threading import RLock import psutil from ...error import LauncherError -from ...log import get_logger +from ...log import ContextThread, get_logger from ..utils.helpers import check_dev_log_level from .util.shell import execute_async_cmd, execute_cmd @@ -74,7 +74,7 @@ def start(self) -> None: The TaskManager is run as a daemon thread meaning that it will die when the main thread dies. """ - monitor = Thread(name="TaskManager", daemon=True, target=self.run) + monitor = ContextThread(name="TaskManager", daemon=True, target=self.run) monitor.start() def run(self) -> None: diff --git a/smartsim/_core/launcher/util/__init__.py b/smartsim/_core/launcher/util/__init__.py index bf6fd954c7..efe03908e0 100644 --- a/smartsim/_core/launcher/util/__init__.py +++ b/smartsim/_core/launcher/util/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/util/launcherUtil.py b/smartsim/_core/launcher/util/launcherUtil.py index 9fcd973e33..a24d69e49d 100644 --- a/smartsim/_core/launcher/util/launcherUtil.py +++ b/smartsim/_core/launcher/util/launcherUtil.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/launcher/util/shell.py b/smartsim/_core/launcher/util/shell.py index 1fc243c157..c747bacbca 100644 --- a/smartsim/_core/launcher/util/shell.py +++ b/smartsim/_core/launcher/util/shell.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py index e1123321b0..cb93958810 100644 --- a/smartsim/_core/utils/__init__.py +++ b/smartsim/_core/utils/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index 8d7edf722e..27059e3207 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/utils/network.py b/smartsim/_core/utils/network.py index f18be208ea..69eeb3e1b6 100644 --- a/smartsim/_core/utils/network.py +++ b/smartsim/_core/utils/network.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# Copyright (c) 2021-2024 Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 6c592d0f34..3bcf1c1f20 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -177,6 +177,8 @@ def set_ml_model(db_model: DBModel, client: Client) -> None: outputs=db_model.outputs, ) else: + if db_model.model is None: + raise ValueError(f"No model attached to {db_model.name}") client.set_model( name=db_model.name, model=db_model.model, @@ -203,7 +205,7 @@ def set_script(db_script: DBScript, client: Client) -> None: client.set_script_from_file( name=db_script.name, file=str(db_script.file), device=device ) - else: + elif db_script.script: if isinstance(db_script.script, str): client.set_script( name=db_script.name, script=db_script.script, device=device @@ -212,7 +214,8 @@ def set_script(db_script: DBScript, client: Client) -> None: client.set_function( name=db_script.name, function=db_script.script, device=device ) - + else: + raise ValueError(f"No script or file attached to {db_script.name}") except RedisReplyError as error: # pragma: no cover logger.error("Error while setting model on orchestrator.") raise error diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py index
75f9aef66d..69840b838c 100644 --- a/smartsim/_core/utils/serialize.py +++ b/smartsim/_core/utils/serialize.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -47,7 +47,7 @@ TStepLaunchMetaData = t.Tuple[ t.Optional[str], t.Optional[str], t.Optional[bool], str, str, Path ] -TELMON_SUBDIR: t.Final[str] = ".smartsim/telemetry" + MANIFEST_FILENAME: t.Final[str] = "manifest.json" _LOGGER = smartsim.log.get_logger(__name__) @@ -58,6 +58,7 @@ def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: return manifest.metadata.run_telemetry_subdirectory.mkdir(parents=True, exist_ok=True) + exp_out, exp_err = smartsim.log.get_exp_log_paths() new_run = { "run_id": manifest.metadata.run_id, @@ -81,12 +82,14 @@ def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: manifest_dict = { "schema info": { "schema_name": "entity manifest", - "version": "0.0.2", + "version": "0.0.3", }, "experiment": { "name": manifest.metadata.exp_name, "path": manifest.metadata.exp_path, "launcher": manifest.metadata.launcher_name, + "out_file": str(exp_out), + "err_file": str(exp_err), }, "runs": [new_run], } diff --git a/smartsim/database/__init__.py b/smartsim/database/__init__.py index f16cf77034..106f8e1e24 100644 --- a/smartsim/database/__init__.py +++ b/smartsim/database/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 07a1a1bfd2..431cb43c5c 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,7 +44,6 @@ from ..settings import ( AprunSettings, BsubBatchSettings, - CobaltBatchSettings, JsrunSettings, MpiexecSettings, MpirunSettings, @@ -64,7 +63,6 @@ "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], - "cobalt": ["aprun", "mpirun", "mpiexec"], "lsf": ["jsrun"], "local": [""], } @@ -389,7 +387,7 @@ def set_cpus(self, num_cpus: int) -> None: :type num_cpus: int """ if self.batch: - if self.launcher in ["pbs", "cobalt"]: + if self.launcher == "pbs": if hasattr(self, "batch_settings") and self.batch_settings: if hasattr(self.batch_settings, "set_ncpus"): self.batch_settings.set_ncpus(num_cpus) @@ -575,7 +573,7 @@ def set_max_message_size(self, size: int = 1_073_741_824) -> None: """ self.set_db_conf("proto-max-bulk-len", str(size)) - def set_db_conf(self, key: str, value: t.Union[int, str]) -> None: + def set_db_conf(self, key: str, value: str) -> None: """Set any valid configuration at runtime without the need to restart the database. 
All configuration parameters that are set are immediately loaded by the database and @@ -938,17 +936,6 @@ def _fill_reserved(self) -> None: "chdir", "D", ] - self._reserved_batch_args[CobaltBatchSettings] = [ - "cwd", - "error", - "e", - "output", - "o", - "outputprefix", - "N", - "l", - "jobname", - ] self._reserved_batch_args[QsubBatchSettings] = ["e", "o", "N", "l"] self._reserved_run_args[JsrunSettings] = [ "chdir", diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index 4ec28f2d46..4566cd76f0 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 403984d16c..9b67687f07 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index bebedb12c6..0a495f0663 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -33,7 +33,10 @@ __all__ = ["DBObject", "DBModel", "DBScript"] -class DBObject: +_DBObjectFuncT = t.TypeVar("_DBObjectFuncT", str, bytes) + + +class DBObject(t.Generic[_DBObjectFuncT]): """Base class for ML objects residing on DB. Should not be instantiated. 
""" @@ -41,14 +44,14 @@ class DBObject: def __init__( self, name: str, - func: t.Optional[str], + func: t.Optional[_DBObjectFuncT], file_path: t.Optional[str], device: t.Literal["CPU", "GPU"], devices_per_node: int, first_device: int, ) -> None: self.name = name - self.func = func + self.func: t.Optional[_DBObjectFuncT] = func self.file: t.Optional[Path] = ( None # Need to have this explicitly to check on it ) @@ -65,9 +68,7 @@ def devices(self) -> t.List[str]: @property def is_file(self) -> bool: - if self.func: - return False - return True + return not self.func @staticmethod def _check_tensor_args( @@ -153,7 +154,7 @@ def _check_devices( raise ValueError(msg) -class DBScript(DBObject): +class DBScript(DBObject[str]): def __init__( self, name: str, @@ -214,12 +215,12 @@ def __str__(self) -> str: return desc_str -class DBModel(DBObject): +class DBModel(DBObject[bytes]): def __init__( self, name: str, backend: str, - model: t.Optional[str] = None, + model: t.Optional[bytes] = None, model_file: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, @@ -276,7 +277,7 @@ def __init__( self.inputs, self.outputs = self._check_tensor_args(inputs, outputs) @property - def model(self) -> t.Union[str, None]: + def model(self) -> t.Optional[bytes]: return self.func def __str__(self) -> str: diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index 28ada31dec..b30f825426 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -357,7 +357,7 @@ def add_ml_model( self, name: str, backend: str, - model: t.Optional[str] = None, + model: t.Optional[bytes] = None, model_path: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, @@ -422,6 +422,18 @@ def add_ml_model( inputs=inputs, outputs=outputs, ) + dupe = next( + ( + db_model.name + for ensemble_ml_model in self._db_models + if ensemble_ml_model.name == db_model.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'An ML Model with name "{db_model.name}" already exists' + ) self._db_models.append(db_model) for entity in self.models: self._extend_entity_db_models(entity, [db_model]) @@ -471,6 +483,18 @@ def add_script( devices_per_node=devices_per_node, first_device=first_device, ) + dupe = next( + ( + db_script.name + for ensemble_script in self._db_scripts + if ensemble_script.name == db_script.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'A Script with name "{db_script.name}" already exists' + ) self._db_scripts.append(db_script) for entity in self.models: self._extend_entity_db_scripts(entity, [db_script]) @@ -517,21 +541,78 @@ def add_function( devices_per_node=devices_per_node, first_device=first_device, ) + dupe = next( + ( + db_script.name + for ensemble_script in self._db_scripts + if ensemble_script.name == db_script.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'A Script with name "{db_script.name}" already exists' + ) self._db_scripts.append(db_script) for entity in self.models: self._extend_entity_db_scripts(entity, [db_script]) @staticmethod def _extend_entity_db_models(model: Model, db_models: t.List[DBModel]) -> None: - entity_db_models = [db_model.name for db_model in model.db_models] + """ + Ensures that the Machine Learning model names being added to the Ensemble + are unique. 
- for db_model in db_models: - if db_model.name not in entity_db_models: - model.add_ml_model_object(db_model) + This static method checks if the provided ML model names already exist in + the Ensemble. An SSUnsupportedError is raised if any duplicate names are + found. Otherwise, it appends the given list of DBModels to the Ensemble. + + :param model: SmartSim Model object. + :type model: Model + :param db_models: List of DBModels to append to the Ensemble. + :type db_models: t.List[DBModel] + """ + for add_ml_model in db_models: + dupe = next( + ( + db_model.name + for db_model in model.db_models + if db_model.name == add_ml_model.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'An ML Model with name "{add_ml_model.name}" already exists' + ) + model.add_ml_model_object(add_ml_model) @staticmethod def _extend_entity_db_scripts(model: Model, db_scripts: t.List[DBScript]) -> None: - entity_db_scripts = [db_script.name for db_script in model.db_scripts] - for db_script in db_scripts: - if not db_script.name in entity_db_scripts: - model.add_script_object(db_script) + """ + Ensures that the script/function names being added to the Ensemble are unique. + + This static method checks if the provided script/function names already exist + in the Ensemble. An SSUnsupportedError is raised if any duplicate names + are found. Otherwise, it appends the given list of DBScripts to the + Ensemble. + + :param model: SmartSim Model object. + :type model: Model + :param db_scripts: List of DBScripts to append to the Ensemble. 
+ :type db_scripts: t.List[DBScript] + """ + for add_script in db_scripts: + dupe = next( + ( + add_script.name + for db_script in model.db_scripts + if db_script.name == add_script.name + ), + None, + ) + if dupe: + raise SSUnsupportedError( + f'A Script with name "{add_script.name}" already exists' + ) + model.add_script_object(add_script) diff --git a/smartsim/entity/entity.py b/smartsim/entity/entity.py index 0d126c907a..46202ca6a1 100644 --- a/smartsim/entity/entity.py +++ b/smartsim/entity/entity.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/entityList.py b/smartsim/entity/entityList.py index 4eaf3faa0b..6d958bda6e 100644 --- a/smartsim/entity/entityList.py +++ b/smartsim/entity/entityList.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/files.py b/smartsim/entity/files.py index 3aae9402b2..9c282b94e5 100644 --- a/smartsim/entity/files.py +++ b/smartsim/entity/files.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 6b97cbf2e5..c7b8731c20 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -258,11 +258,11 @@ def colocate_db_uds( f"Invalid name for unix socket: {unix_socket}. Must only " "contain alphanumeric characters or . : _ - /" ) - - uds_options = { + uds_options: t.Dict[str, t.Union[int, str]] = { "unix_socket": unix_socket, "socket_permissions": socket_permissions, - "port": 0, # This is hardcoded to 0 as recommended by redis for UDS + # This is hardcoded to 0 as recommended by redis for UDS + "port": 0, } common_options = { @@ -332,9 +332,18 @@ def colocate_db_tcp( def _set_colocated_db_settings( self, - connection_options: t.Dict[str, t.Any], - common_options: t.Dict[str, t.Any], - **kwargs: t.Any, + connection_options: t.Mapping[str, t.Union[int, t.List[str], str]], + common_options: t.Dict[ + str, + t.Union[ + t.Union[t.Iterable[t.Union[int, t.Iterable[int]]], None], + bool, + int, + str, + None, + ], + ], + **kwargs: t.Union[int, None], ) -> None: """ Ingest the connection-specific options (UDS/TCP) and set the final settings @@ -357,21 +366,42 @@ def _set_colocated_db_settings( ) # TODO list which db settings can be extras + custom_pinning_ = t.cast( + t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], + common_options.get("custom_pinning"), + ) + cpus_ = t.cast(int, common_options.get("cpus")) common_options["custom_pinning"] = self._create_pinning_string( - common_options["custom_pinning"], common_options["cpus"] + custom_pinning_, cpus_ ) - colo_db_config = {} + colo_db_config: t.Dict[ + str, + t.Union[ + bool, + int, + str, + None, + t.List[str], + t.Iterable[t.Union[int, t.Iterable[int]]], + t.List[DBModel], + t.List[DBScript], + t.Dict[str, t.Union[int, None]], + t.Dict[str, str], + ], + ] = {} colo_db_config.update(connection_options) colo_db_config.update(common_options) - # redisai arguments for inference settings - colo_db_config["rai_args"] = { + + redis_ai_temp = { "threads_per_queue": kwargs.get("threads_per_queue", None), "inter_op_parallelism": 
kwargs.get("inter_op_parallelism", None), "intra_op_parallelism": kwargs.get("intra_op_parallelism", None), } + # redisai arguments for inference settings + colo_db_config["rai_args"] = redis_ai_temp colo_db_config["extra_db_args"] = { - k: str(v) for k, v in kwargs.items() if k not in colo_db_config["rai_args"] + k: str(v) for k, v in kwargs.items() if k not in redis_ai_temp } self._check_db_objects_colo() @@ -455,7 +485,7 @@ def add_ml_model( self, name: str, backend: str, - model: t.Optional[str] = None, + model: t.Optional[bytes] = None, model_path: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, diff --git a/smartsim/entity/strategies.py b/smartsim/entity/strategies.py index e9db30c8fb..2af88b58e7 100644 --- a/smartsim/entity/strategies.py +++ b/smartsim/entity/strategies.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/error/__init__.py b/smartsim/error/__init__.py index a04f5d91e0..4268905e69 100644 --- a/smartsim/error/__init__.py +++ b/smartsim/error/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index d71ae3f71e..9a6954907e 100644 --- a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 9de33419aa..9fcc7b13ea 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,18 +31,30 @@ from tabulate import tabulate +from smartsim.error.errors import SSUnsupportedError + from ._core import Controller, Generator, Manifest from ._core.utils import init_default from .database import Orchestrator from .entity import Ensemble, Model, SmartSimEntity from .error import SmartSimError -from .log import get_logger +from .log import ctx_exp_path, get_logger, method_contextualizer from .settings import Container, base, settings from .wlm import detect_launcher logger = get_logger(__name__) +def _exp_path_map(exp: "Experiment") -> str: + """Mapping function for use by method contextualizer to place the path of + the currently-executing experiment into context for log enrichment""" + return exp.exp_path + + +_contextualize = method_contextualizer(ctx_exp_path, _exp_path_map) + + +# pylint: disable=no-self-use class Experiment: """Experiments are the Python user interface for SmartSim. @@ -110,7 +122,7 @@ def __init__( :param exp_path: path to location of ``Experiment`` directory if generated :type exp_path: str, optional :param launcher: type of launcher being used, options are "slurm", "pbs", - "cobalt", "lsf", or "local". If set to "auto", + "lsf", or "local". If set to "auto", an attempt will be made to find an available launcher on the system. 
Defaults to "local" @@ -123,15 +135,18 @@ def __init__( if not osp.isdir(osp.abspath(exp_path)): raise NotADirectoryError("Experiment path provided does not exist") exp_path = osp.abspath(exp_path) - self.exp_path = init_default(osp.join(getcwd(), name), exp_path, str) + self.exp_path: str = init_default(osp.join(getcwd(), name), exp_path, str) if launcher == "auto": launcher = detect_launcher() + if launcher == "cobalt": + raise SSUnsupportedError("Cobalt launcher is no longer supported.") self._control = Controller(launcher=launcher) self._launcher = launcher.lower() self.db_identifiers: t.Set[str] = set() + @_contextualize def start( self, *args: t.Any, @@ -205,6 +220,7 @@ def start( logger.error(e) raise + @_contextualize def stop(self, *args: t.Any) -> None: """Stop specific instances launched by this ``Experiment`` @@ -241,6 +257,7 @@ def stop(self, *args: t.Any) -> None: logger.error(e) raise + @_contextualize def generate( self, *args: t.Any, @@ -278,6 +295,7 @@ def generate( logger.error(e) raise + @_contextualize def poll( self, interval: int = 10, verbose: bool = True, kill_on_interrupt: bool = True ) -> None: @@ -321,6 +339,7 @@ def poll( logger.error(e) raise + @_contextualize def finished(self, entity: SmartSimEntity) -> bool: """Query if a job has completed. 
@@ -344,6 +363,7 @@ def finished(self, entity: SmartSimEntity) -> bool: logger.error(e) raise + @_contextualize def get_status(self, *args: t.Any) -> t.List[str]: """Query the status of launched instances @@ -382,8 +402,9 @@ def get_status(self, *args: t.Any) -> t.List[str]: logger.error(e) raise - @staticmethod + @_contextualize def create_ensemble( + self, name: str, params: t.Optional[t.Dict[str, t.Any]] = None, batch_settings: t.Optional[base.BatchSettings] = None, @@ -456,8 +477,9 @@ def create_ensemble( logger.error(e) raise - @staticmethod + @_contextualize def create_model( + self, name: str, run_settings: base.RunSettings, params: t.Optional[t.Dict[str, t.Any]] = None, @@ -553,7 +575,6 @@ def create_model( """ path = init_default(getcwd(), path, str) - # mcb if path is None: path = getcwd() if params is None: @@ -570,6 +591,7 @@ def create_model( logger.error(e) raise + @_contextualize def create_run_settings( self, exe: str, @@ -634,6 +656,7 @@ class in SmartSim. If found, the class corresponding logger.error(e) raise + @_contextualize def create_batch_settings( self, nodes: int = 1, @@ -694,6 +717,7 @@ def create_batch_settings( logger.error(e) raise + @_contextualize def create_database( self, port: int = 6379, @@ -777,6 +801,7 @@ def create_database( **kwargs, ) + @_contextualize def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: """Reconnect to a running ``Orchestrator`` @@ -797,6 +822,7 @@ def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: logger.error(e) raise + @_contextualize def summary(self, style: str = "github") -> str: """Return a summary of the ``Experiment`` diff --git a/smartsim/log.py b/smartsim/log.py index baf54f0683..55cb88afb3 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -23,27 +23,48 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import functools import logging -import os +import pathlib import sys +import threading import typing as t +from contextvars import ContextVar, copy_context import coloredlogs +from smartsim._core.config import CONFIG + # constants DEFAULT_DATE_FORMAT: t.Final[str] = "%H:%M:%S" DEFAULT_LOG_FORMAT: t.Final[str] = ( "%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s" ) +EXPERIMENT_LOG_FORMAT = DEFAULT_LOG_FORMAT.replace("s[%", "s {%(exp_path)s} [%") # configure colored loggs coloredlogs.DEFAULT_DATE_FORMAT = DEFAULT_DATE_FORMAT coloredlogs.DEFAULT_LOG_FORMAT = DEFAULT_LOG_FORMAT +# create context vars used by loggers +ctx_exp_path = ContextVar("exp_path", default="") + + +# Generic types for method contextualizers +_T = t.TypeVar("_T") +_RT = t.TypeVar("_RT") +_ContextT = t.TypeVar("_ContextT") + +if t.TYPE_CHECKING: + from typing_extensions import Concatenate, ParamSpec -def _get_log_level() -> str: - """Get the logging level based on environment variable - SMARTSIM_LOG_LEVEL. If not set, default to info. + _PR = ParamSpec("_PR") + + +def _translate_log_level(user_log_level: str = "info") -> str: + """Translate value of CONFIG.log_level to one + accepted as ``level`` option by Python's logging module. Logging levels - quiet: Just shows errors and warnings @@ -52,22 +73,106 @@ def _get_log_level() -> str: - developer: Shows everything happening during execution extremely verbose logging. 
- :return: Log level for coloredlogs + :param user_log_level: log level specified by user, defaults to info + :type user_log_level: str + :returns: Log level for coloredlogs :rtype: str """ - log_level = os.environ.get("SMARTSIM_LOG_LEVEL", "info").lower() - if log_level == "quiet": + user_log_level = user_log_level.lower() + if user_log_level in ["info", "debug", "warning"]: + return user_log_level + if user_log_level == "quiet": return "warning" - if log_level == "info": - return "info" - if log_level == "debug": - return "debug" # extremely verbose logging used internally - if log_level == "developer": + if user_log_level == "developer": return "debug" return "info" +def get_exp_log_paths() -> t.Tuple[t.Optional[pathlib.Path], t.Optional[pathlib.Path]]: + """Returns the output and error file paths to experiment logs. + Returns None for both paths if experiment context is unavailable. + + :returns: 2-tuple of paths to experiment logs in form (output_path, error_path) + if telemetry is enabled, a 2-tuple of None otherwise + :rtype: Tuple[pathlib.Path | None, pathlib.Path | None] + """ + default_paths = None, None + + if not CONFIG.telemetry_enabled: + return default_paths + + if _exp_path := ctx_exp_path.get(): + file_out = pathlib.Path(_exp_path) / CONFIG.telemetry_subdir / "smartsim.out" + file_err = pathlib.Path(_exp_path) / CONFIG.telemetry_subdir / "smartsim.err" + return file_out, file_err + + return default_paths + + +class ContextThread(threading.Thread): + """Thread that ensures the context vars of the caller are available""" + + def run(self) -> None: + """Execute a thread on a copy of the current thread context""" + ctx = copy_context() + return ctx.run(super().run) + + +class ContextInjectingLogFilter(logging.Filter): + """Filter that performs enrichment of a log record by adding context + information about the experiment being executed""" + + def filter(self, record: logging.LogRecord) -> bool: + """Enrich log records with active experiment context + 
+ :param record: the record to evaluate for filtering + :type record: logging.LogRecord + :returns: always True + :rtype: bool + """ + record.exp_path = ctx_exp_path.get() + return True + + +class ContextAwareLogger(logging.Logger): + """A logger customized to automatically write experiment logs to a + dynamic target directory by inspecting the value of a context var""" + + def __init__(self, name: str, level: t.Union[int, str] = 0) -> None: + super().__init__(name, level) + self.addFilter(ContextInjectingLogFilter(name="exp-ctx-log-filter")) + + def _log( + self, + level: int, + msg: object, + args: t.Any, + exc_info: t.Optional[t.Any] = None, + extra: t.Optional[t.Any] = None, + stack_info: bool = False, + stacklevel: int = 1, + ) -> None: + """Automatically attach file handlers if contextual information is found""" + file_out, file_err = get_exp_log_paths() + + if not all([file_out, file_err]): + super()._log(level, msg, args, exc_info, extra, stack_info, stacklevel) + return + + _lvl = logging.getLevelName(self.level) + fmt = EXPERIMENT_LOG_FORMAT + + low_pass = LowPassFilter(_lvl) + h_out = log_to_exp_file(str(file_out), self, _lvl, fmt, low_pass) + h_err = log_to_exp_file(str(file_err), self, "WARN", fmt) + + super()._log(level, msg, args, exc_info, extra, stack_info, stacklevel) + + for handler in [h_out, h_err]: + self.removeHandler(handler) + + def get_logger( name: str, log_level: t.Optional[str] = None, fmt: t.Optional[str] = None ) -> logging.Logger: @@ -99,19 +204,46 @@ def get_logger( """ # if name is None, then logger is the root logger # if not root logger, get the name of file without prefix. 
- user_log_level = _get_log_level() + user_log_level = CONFIG.log_level if user_log_level != "developer": name = "SmartSim" + logging.setLoggerClass(ContextAwareLogger) logger = logging.getLogger(name) if log_level: logger.setLevel(log_level) else: - log_level = user_log_level + log_level = _translate_log_level(user_log_level) coloredlogs.install(level=log_level, logger=logger, fmt=fmt, stream=sys.stdout) return logger +class LowPassFilter(logging.Filter): + """A filter that passes all records below a specified level""" + + def __init__(self, maximum_level: str = "INFO"): + """Create a low-pass log filter allowing messages below a specific log level + + :param maximum_level: The maximum log level to be passed by the filter + :type maximum_level: str + """ + super().__init__() + self.max = maximum_level + + def filter(self, record: logging.LogRecord) -> bool: + """Filter log records; pass those less than or equal to the maximum level + + :param record: the record to evaluate for filtering + :type record: logging.LogRecord + :returns: True if record level passes filter, False otherwise + :rtype: bool + """ + # If a string representation of the level is passed in, + # the corresponding numeric value is returned. + level_no: int = logging.getLevelName(self.max) + return record.levelno <= level_no + + def log_to_file(filename: str, log_level: str = "debug") -> None: """Installs a second filestream handler to the root logger, allowing subsequent logging calls to be sent to filename. @@ -122,10 +254,106 @@ def log_to_file(filename: str, log_level: str = "debug") -> None: :param log_level: as defined in get_logger. Can be specified to allow the file to store more or less verbose logging information. 
- :type log_level: int | str + :type log_level: str """ logger = logging.getLogger("SmartSim") stream = open( # pylint: disable=consider-using-with filename, "w+", encoding="utf-8" ) coloredlogs.install(stream=stream, logger=logger, level=log_level) + + +def log_to_exp_file( + filename: str, + logger: logging.Logger, + log_level: str = "warn", + fmt: t.Optional[str] = EXPERIMENT_LOG_FORMAT, + log_filter: t.Optional[logging.Filter] = None, +) -> logging.Handler: + """Installs a second filestream handler to the root logger, + allowing subsequent logging calls to be sent to filename. + + :param filename: the name of the desired log file. + :type filename: str + :param log_level: as defined in get_logger. Can be specified + to allow the file to store more or less verbose + logging information. + :type log_level: int | str + :param logger: an existing logger to add the handler to + :type logger: (optional) logging.Logger + :param fmt: a log format for the handler (otherwise, EXPERIMENT_LOG_FORMAT) + :type fmt: (optional) str + :param log_filter: log filter to attach to handler + :type log_filter: (optional) logging.Filter + :return: logging.Handler + :rtype: logging.Handler + """ + # ensure logs are written even if specified dir doesn't exist + log_path = pathlib.Path(filename) + if not log_path.parent.exists(): + log_path.parent.mkdir(parents=True, exist_ok=True) + + handler = logging.FileHandler(filename, mode="a+", encoding="utf-8") + + if log_filter: + handler.addFilter(log_filter) + + formatter = logging.Formatter(fmt=fmt, datefmt=DEFAULT_DATE_FORMAT) + + handler.setFormatter(formatter) + handler.setLevel(log_level.upper()) + + logger.addHandler(handler) + return handler + + +def method_contextualizer( + ctx_var: ContextVar[_ContextT], + ctx_map: t.Callable[[_T], _ContextT], +) -> """t.Callable[ + [t.Callable[Concatenate[_T, _PR], _RT]], + t.Callable[Concatenate[_T, _PR], _RT], +]""": + """Parameterized-decorator factory that enables a target value + to be placed 
into global context prior to execution of the + decorated method. + Usage Note: the use of `self` below requires that the decorated function is passed + the object containing a value that will be modified in the context. `ctx_map` + must accept an instance of matching type. + + :param ctx_var: The ContextVar that will be modified + :type ctx_var: ContextVar + :param ctx_map: A function that returns the value to be set to ctx_var + :type ctx_map: t.Callable[[_T], _ContextT]""" + + def _contextualize( + fn: "t.Callable[Concatenate[_T, _PR], _RT]", / + ) -> "t.Callable[Concatenate[_T, _PR], _RT]": + """Executes the decorated method in a cloned context and ensures + `ctx_var` is updated to the value returned by `ctx_map` prior to + calling the decorated method""" + + @functools.wraps(fn) + def _contextual( + self: _T, + *args: "_PR.args", + **kwargs: "_PR.kwargs", + ) -> _RT: + """A decorator operator that runs the decorated method in a new + context with the desired contextual information modified.""" + + def _ctx_modifier() -> _RT: + """Helper to simplify calling the target method with the + modified value set in `ctx_var`""" + ctx_val = ctx_map(self) + token = ctx_var.set(ctx_val) + result = fn(self, *args, **kwargs) + ctx_var.reset(token) + return result + + ctx = copy_context() + return ctx.run(_ctx_modifier) + + return _contextual + + return _contextualize diff --git a/smartsim/ml/__init__.py b/smartsim/ml/__init__.py index 84fd06b578..eb74c59571 100644 --- a/smartsim/ml/__init__.py +++ b/smartsim/ml/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/ml/data.py b/smartsim/ml/data.py index 3dfca9f0cb..4cdc27c060 100644 --- a/smartsim/ml/data.py +++ b/smartsim/ml/data.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -35,6 +35,9 @@ from ..error import SSInternalError from ..log import get_logger +if t.TYPE_CHECKING: + import numpy.typing as npt + logger = get_logger(__name__) @@ -118,7 +121,7 @@ def download(self, client: Client) -> None: if "target_name" in field_names: self.target_name = info_ds.get_meta_strings("target_name")[0] if "num_classes" in field_names: - self.num_classes = info_ds.get_meta_scalars("num_classes")[0] + self.num_classes = int(info_ds.get_meta_scalars("num_classes")[0]) def __repr__(self) -> str: strings = ["DataInfo object"] @@ -311,8 +314,8 @@ def __init__( self.address = address self.cluster = cluster self.verbose = verbose - self.samples = None - self.targets = None + self.samples: t.Optional["npt.NDArray[t.Any]"] = None + self.targets: t.Optional["npt.NDArray[t.Any]"] = None self.num_samples = 0 self.indices = np.arange(0) self.shuffle = shuffle @@ -460,14 +463,20 @@ def _add_samples(self, indices: t.List[int]) -> None: if self.samples is not None: for dataset in datasets: self.samples = np.concatenate( - (self.samples, dataset.get_tensor(self.sample_name)) + ( + t.cast("npt.NDArray[t.Any]", self.samples), + dataset.get_tensor(self.sample_name), + ) ) if self.need_targets: self.targets = np.concatenate( - (self.targets, dataset.get_tensor(self.target_name)) + ( + t.cast("npt.NDArray[t.Any]", self.targets), + dataset.get_tensor(self.target_name), + ) ) - self.num_samples = self.samples.shape[0] + self.num_samples = t.cast("npt.NDArray[t.Any]", self.samples).shape[0] self.indices = 
np.arange(self.num_samples) self.log(f"New dataset size: {self.num_samples}, batches: {len(self)}") @@ -496,8 +505,8 @@ def update_data(self) -> None: np.random.shuffle(self.indices) def _data_generation( - self, indices: np.ndarray # type: ignore[type-arg] - ) -> t.Tuple[np.ndarray, np.ndarray]: # type: ignore[type-arg] + self, indices: "npt.NDArray[t.Any]" + ) -> t.Tuple["npt.NDArray[t.Any]", "npt.NDArray[t.Any]"]: # Initialization if self.samples is None: raise ValueError("Samples have not been initialized") @@ -505,10 +514,10 @@ def _data_generation( xval = self.samples[indices] if self.need_targets: - yval = self.targets[indices] + yval = t.cast("npt.NDArray[t.Any]", self.targets)[indices] elif self.autoencoding: yval = xval else: - return xval + return xval # type: ignore[no-any-return] return xval, yval diff --git a/smartsim/ml/tf/__init__.py b/smartsim/ml/tf/__init__.py index 2f6646dbda..eb3cb565e1 100644 --- a/smartsim/ml/tf/__init__.py +++ b/smartsim/ml/tf/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/ml/tf/data.py b/smartsim/ml/tf/data.py index ae0b9aadd1..ffc9695511 100644 --- a/smartsim/ml/tf/data.py +++ b/smartsim/ml/tf/data.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -31,6 +31,9 @@ from smartsim.ml import DataDownloader +if t.TYPE_CHECKING: + import numpy.typing as npt + class _TFDataGenerationCommon(DataDownloader, keras.utils.Sequence): def __getitem__( @@ -60,7 +63,9 @@ def on_epoch_end(self) -> None: if self.shuffle: np.random.shuffle(self.indices) - def _data_generation(self, indices: np.ndarray) -> t.Tuple[np.ndarray, np.ndarray]: # type: ignore[type-arg] + def _data_generation( + self, indices: "npt.NDArray[t.Any]" + ) -> t.Tuple["npt.NDArray[t.Any]", "npt.NDArray[t.Any]"]: # Initialization if self.samples is None: raise ValueError("No samples loaded for data generation") @@ -68,13 +73,13 @@ def _data_generation(self, indices: np.ndarray) -> t.Tuple[np.ndarray, np.ndarra xval = self.samples[indices] if self.need_targets: - yval = self.targets[indices] + yval = t.cast("npt.NDArray[t.Any]", self.targets)[indices] if self.num_classes is not None: yval = keras.utils.to_categorical(yval, num_classes=self.num_classes) elif self.autoencoding: yval = xval else: - return xval + return xval # type: ignore[no-any-return] return xval, yval diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index c8018ac322..69c8e25808 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/ml/torch/__init__.py b/smartsim/ml/torch/__init__.py index b90a6ffdb0..fcc0f2a1fa 100644 --- a/smartsim/ml/torch/__init__.py +++ b/smartsim/ml/torch/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/ml/torch/data.py b/smartsim/ml/torch/data.py index 166a29e055..c6a8e6eac5 100644 --- a/smartsim/ml/torch/data.py +++ b/smartsim/ml/torch/data.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/py.typed b/smartsim/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/servertype.py b/smartsim/servertype.py index a83149c237..06d0bc8e52 100644 --- a/smartsim/servertype.py +++ b/smartsim/servertype.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/__init__.py b/smartsim/settings/__init__.py index 542aeab1d2..d417c9ef8f 100644 --- a/smartsim/settings/__init__.py +++ b/smartsim/settings/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -26,7 +26,6 @@ from .alpsSettings import AprunSettings from .base import RunSettings, SettingsBase -from .cobaltSettings import CobaltBatchSettings from .containers import Container, Singularity from .lsfSettings import BsubBatchSettings, JsrunSettings from .mpiSettings import MpiexecSettings, MpirunSettings, OrterunSettings @@ -36,7 +35,6 @@ __all__ = [ "AprunSettings", - "CobaltBatchSettings", "BsubBatchSettings", "JsrunSettings", "MpirunSettings", diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index b36c3d3339..5357312a5d 100644 --- a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -43,8 +43,7 @@ def __init__( ): """Settings to run job with ``aprun`` command - ``AprunSettings`` can be used for both the `pbs` and `cobalt` - launchers. + ``AprunSettings`` can be used for the `pbs` launcher. :param exe: executable :type exe: str diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index a6df4eed4c..284d435c0f 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -1,5 +1,5 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -30,6 +30,7 @@ from smartsim.settings.containers import Container from .._core.utils.helpers import expand_exe_path, fmt_dict, is_valid_cmd +from ..entity.dbobject import DBModel, DBScript from ..log import get_logger logger = get_logger(__name__) @@ -96,7 +97,23 @@ def __init__( self.container = container self._run_command = run_command self.in_batch = False - self.colocated_db_settings: t.Optional[t.Dict[str, str]] = None + self.colocated_db_settings: t.Optional[ + t.Dict[ + str, + t.Union[ + bool, + int, + str, + None, + t.List[str], + t.Iterable[t.Union[int, t.Iterable[int]]], + t.List[DBModel], + t.List[DBScript], + t.Dict[str, t.Union[int, None]], + t.Dict[str, str], + ], + ] + ] = None @property def exe_args(self) -> t.Union[str, t.List[str]]: diff --git a/smartsim/settings/cobaltSettings.py b/smartsim/settings/cobaltSettings.py deleted file mode 100644 index 5a0e07b409..0000000000 --- a/smartsim/settings/cobaltSettings.py +++ /dev/null @@ -1,171 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import typing as t - -from .base import BatchSettings - - -class CobaltBatchSettings(BatchSettings): - def __init__( - self, - nodes: t.Optional[int] = None, - time: str = "", - queue: t.Optional[str] = None, - account: t.Optional[str] = None, - batch_args: t.Optional[t.Dict[str, t.Optional[str]]] = None, - **kwargs: t.Any, - ) -> None: - """Specify settings for a Cobalt ``qsub`` batch launch - - If the argument doesn't have a parameter, put None - as the value. e.g. {'exclusive': None} - - Initialization values provided (nodes, time, account) - will overwrite the same arguments in ``batch_args`` if present - - :param nodes: number of nodes, defaults to None - :type nodes: int, optional - :param time: walltime for job, e.g. "10:00:00" for 10 hours, - defaults to empty str - :type time: str, optional - :param queue: queue to launch job in, defaults to None - :type queue: str, optional - :param account: account for job, defaults to None - :type account: str, optional - :param batch_args: extra batch arguments, defaults to None - :type batch_args: dict[str, str], optional - """ - super().__init__( - "qsub", - batch_args=batch_args, - nodes=nodes, - account=account, - queue=queue, - time=time, - **kwargs, - ) - - def set_walltime(self, walltime: str) -> None: - """Set the walltime of the job - - format = "HH:MM:SS" - - Cobalt walltime can also be specified with number - of minutes. 
- - :param walltime: wall time - :type walltime: str - """ - # TODO check for formatting errors here - # TODO catch existing "t" in batch_args - if walltime: - self.batch_args["time"] = walltime - - def set_nodes(self, num_nodes: int) -> None: - """Set the number of nodes for this batch job - - :param num_nodes: number of nodes - :type num_nodes: int - """ - # TODO catch existing "n" in batch_args - if num_nodes: - self.batch_args["nodecount"] = str(int(num_nodes)) - - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: - """Specify the hostlist for this job - - :param host_list: hosts to launch on - :type host_list: str | list[str] - :raises TypeError: if not str or list of str - """ - if isinstance(host_list, str): - host_list = [host_list.strip()] - if not isinstance(host_list, list): - raise TypeError("host_list argument must be a list of strings") - if not all(isinstance(host, str) for host in host_list): - raise TypeError("host_list argument must be list of strings") - hosts = ",".join(host_list) - self.batch_args["attrs"] = f"location={hosts}" - - def set_tasks(self, num_tasks: int) -> None: - """Set total number of processes to start - - :param num_tasks: number of processes - :type num_tasks: int - """ - self.batch_args["proccount"] = str(int(num_tasks)) - - def set_queue(self, queue: str) -> None: - """Set the queue for the batch job - - :param queue: queue name - :type queue: str - """ - # TODO catch existing "q" in batch args - if queue: - self.batch_args["queue"] = str(queue) - - def set_account(self, account: str) -> None: - """Set the account for this batch job - - :param acct: account id - :type acct: str - """ - # TODO catch existing "A" in batch_args - if account: - self.batch_args["project"] = account - - def format_batch_args(self) -> t.List[str]: - """Get the formatted batch arguments for a preview - - :return: list of batch arguments for Sbatch - :rtype: list[str] - """ - restricted = [ - "o", - "output", # output is 
determined by interface - "O", - "outputprefix", # step name is output prefix - "e", - "error", # error is determined by interface - "cwd", # cwd is determined by interface - "jobname", # step name is jobname - ] - opts = [] - for opt, value in self.batch_args.items(): - if opt not in restricted: - # attach "-" prefix if argument is 1 character otherwise "--" - short_arg = bool(len(str(opt)) == 1) - prefix = "-" if short_arg else "--" - if not value: - opts += [prefix + opt] - else: - if short_arg: - opts += [prefix + opt, str(value)] - else: - opts += [" ".join((prefix + opt, str(value)))] - return opts diff --git a/smartsim/settings/containers.py b/smartsim/settings/containers.py index 6d5a72f801..bdba1ce889 100644 --- a/smartsim/settings/containers.py +++ b/smartsim/settings/containers.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/lsfSettings.py b/smartsim/settings/lsfSettings.py index 47fe918027..32902c8c61 100644 --- a/smartsim/settings/lsfSettings.py +++ b/smartsim/settings/lsfSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -97,7 +97,7 @@ def set_cpus_per_rs(self, cpus_per_rs: int) -> None: :type cpus_per_rs: int or str """ if self.colocated_db_settings: - db_cpus = int(self.colocated_db_settings.get("db_cpus", 0)) + db_cpus = int(t.cast(int, self.colocated_db_settings.get("db_cpus", 0))) if not db_cpus: raise ValueError("db_cpus must be configured on colocated_db_settings") diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index 5b6b520e3d..ce132bcc50 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/settings/mpirunSettings.py index b290e23558..994d62bba9 100644 --- a/smartsim/settings/mpirunSettings.py +++ b/smartsim/settings/mpirunSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index bcfee1ff14..e43cd94667 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 0a4b0868a2..19a58b11c5 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index b09286e8c8..6e6172507e 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +31,6 @@ from ..settings import ( AprunSettings, BsubBatchSettings, - CobaltBatchSettings, Container, JsrunSettings, MpiexecSettings, @@ -81,7 +80,6 @@ def create_batch_settings( """ # all supported batch class implementations by_launcher: t.Dict[str, t.Callable[..., base.BatchSettings]] = { - "cobalt": CobaltBatchSettings, "pbs": QsubBatchSettings, "slurm": SbatchSettings, "lsf": BsubBatchSettings, @@ -164,7 +162,6 @@ def create_run_settings( "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], - "cobalt": ["aprun", "mpirun", "mpiexec"], "lsf": ["jsrun", "mpirun", "mpiexec"], "local": [""], } diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 8da8659e1d..935a8df39f 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -256,13 +256,9 @@ def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str: :param seconds: number of seconds to run job :type seconds: int :returns: Formatted walltime - :rtype + :rtype: str """ - delta = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) - fmt_str = str(delta) - if delta.seconds // 3600 < 10: - fmt_str = "0" + fmt_str - return fmt_str + return fmt_walltime(hours, minutes, seconds) def set_walltime(self, walltime: str) -> None: """Set the walltime of the job @@ -390,6 +386,27 @@ def format_comma_sep_env_vars(self) -> t.Tuple[str, t.List[str]]: return fmt_exported_env, compound_env +def fmt_walltime(hours: int, minutes: int, seconds: int) -> str: + """Helper function walltime format conversion + + Converts time to format HH:MM:SS + + :param hours: number of hours to run job + :type hours: int + :param minutes: number of minutes to run job + :type minutes: int + :param seconds: number of seconds to run job + :type seconds: int + :returns: Formatted walltime + :rtype: str + """ + delta = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) + fmt_str = str(delta) + if delta.seconds // 3600 < 10: + fmt_str = "0" + fmt_str + return fmt_str + + class SbatchSettings(BatchSettings): def __init__( self, diff --git a/smartsim/slurm.py b/smartsim/slurm.py index 105800a147..6a32d0213a 100644 --- a/smartsim/slurm.py +++ b/smartsim/slurm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/status.py b/smartsim/status.py index 74d440b8e4..409ec8c1ae 100644 --- a/smartsim/status.py +++ b/smartsim/status.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/wlm/__init__.py b/smartsim/wlm/__init__.py index d7dd298bee..3a82a81e56 100644 --- a/smartsim/wlm/__init__.py +++ b/smartsim/wlm/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -36,7 +36,7 @@ def detect_launcher() -> str: """Detect available launcher.""" - # Precedence: PBS, Cobalt, LSF, Slurm, local + # Precedence: PBS, LSF, Slurm, local if which("qsub") and which("qstat") and which("qdel"): qsub_version = run( ["qsub", "--version"], @@ -47,8 +47,6 @@ def detect_launcher() -> str: ) if "pbs" in (qsub_version.stdout).lower(): return "pbs" - if "cobalt" in (qsub_version.stdout).lower(): - return "cobalt" if all( [which("bsub"), which("jsrun"), which("jslist"), which("bjobs"), which("bkill")] ): @@ -66,9 +64,7 @@ def detect_launcher() -> str: ): return "slurm" # Systems like ThetaGPU don't have - # Cobalt or PBS on compute nodes - if "COBALT_JOBID" in os.environ: - return "cobalt" + # PBS on compute nodes if "PBS_JOBID" in os.environ: return "pbs" return "local" diff --git a/smartsim/wlm/pbs.py b/smartsim/wlm/pbs.py index 995ba5fc9f..eda5baf244 100644 --- a/smartsim/wlm/pbs.py +++ b/smartsim/wlm/pbs.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py index ba46fb64c8..9308eea98f 100644 --- a/smartsim/wlm/slurm.py +++ b/smartsim/wlm/slurm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -39,6 +39,7 @@ SSReservedKeywordError, ) from ..log import get_logger +from ..settings.slurmSettings import fmt_walltime logger = get_logger(__name__) @@ -248,7 +249,7 @@ def _get_alloc_cmd( "SmartSim", ] if time: - salloc_args.extend(["-t", time]) + salloc_args.extend(["-t", _validate_time_format(time)]) if account: salloc_args.extend(["-A", str(account)]) @@ -273,6 +274,25 @@ def _get_alloc_cmd( return salloc_args +def _validate_time_format(time: str) -> str: + """Convert time into valid walltime format + + By defualt the formatted wall time is the total number of seconds. + + :param time: number of hours to run job + :type time: str + :returns: Formatted walltime + :rtype: str + """ + try: + hours, minutes, seconds = map(int, time.split(":")) + except ValueError as e: + raise ValueError( + "Input time must be formatted as `HH:MM:SS` with valid Integers." + ) from e + return fmt_walltime(hours, minutes, seconds) + + def get_hosts() -> t.List[str]: """Get the name of the nodes used in a slurm allocation. diff --git a/tests/__init__.py b/tests/__init__.py index bf6fd954c7..efe03908e0 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/run_sklearn_onnx.py b/tests/backends/run_sklearn_onnx.py index d4377bbdee..f10c8c7fb1 100644 --- a/tests/backends/run_sklearn_onnx.py +++ b/tests/backends/run_sklearn_onnx.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/run_tf.py b/tests/backends/run_tf.py index c9cf0ee04b..ec5d0142bb 100644 --- a/tests/backends/run_tf.py +++ b/tests/backends/run_tf.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/run_torch.py b/tests/backends/run_torch.py index d57cfad9df..6e9ba28598 100644 --- a/tests/backends/run_torch.py +++ b/tests/backends/run_torch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_cli_mini_exp.py b/tests/backends/test_cli_mini_exp.py index 7c793e9158..f02f44270c 100644 --- a/tests/backends/test_cli_mini_exp.py +++ b/tests/backends/test_cli_mini_exp.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index 37c4296efa..d02f3f33c9 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 1cfc1efcb6..75e9f515d2 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -138,6 +138,7 @@ def create_tf_cnn(): def save_torch_cnn(path, file_name): n = PyTorchNet() + n.eval() example_forward_input = torch.rand(1, 1, 28, 28) module = torch.jit.trace(n, example_forward_input) torch.jit.save(module, path + "/" + file_name) @@ -858,3 +859,83 @@ def test_inconsistent_params_db_model(): ex.value.args[0] == "Cannot set devices_per_node>1 if CPU is specified under devices" ) + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +def test_db_model_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): + """Test DBModels on remote DB, with an ensemble""" + + # Set experiment name + exp_name = "test-db-model-ensemble-duplicate" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = 1 # TF backend fails on multiple GPUs + + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + # 
Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings.set_nodes(1) + run_settings.set_tasks(1) + + # Create ensemble + smartsim_ensemble = exp.create_ensemble( + "smartsim_ensemble", run_settings=run_settings, replicas=2 + ) + + # Create Model + smartsim_model = exp.create_model("smartsim_model", run_settings) + + # Create and save ML model to filesystem + model, inputs, outputs = create_tf_cnn() + model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + + # Add the first ML model to all of the ensemble members + smartsim_ensemble.add_ml_model( + "cnn", + "TF", + model=model, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=inputs, + outputs=outputs, + ) + + # Attempt to add a duplicate ML model to Ensemble via Ensemble.add_ml_model() + with pytest.raises(SSUnsupportedError) as ex: + smartsim_ensemble.add_ml_model( + "cnn", + "TF", + model=model, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=inputs, + outputs=outputs, + ) + assert ex.value.args[0] == 'An ML Model with name "cnn" already exists' + + # Add same name ML model to a new SmartSim Model + smartsim_model.add_ml_model( + "cnn", + "TF", + model_path=model_file2, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=inputs2, + outputs=outputs2, + ) + + # Attempt to add a duplicate ML model to Ensemble via Ensemble.add_model() + with pytest.raises(SSUnsupportedError) as ex: + smartsim_ensemble.add_model(smartsim_model) + assert ex.value.args[0] == 'An ML Model with name "cnn" already exists' diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index e6cacd4d01..2bffd1da69 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# 
Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -623,3 +623,107 @@ def test_inconsistent_params_db_script(fileutils): ex.value.args[0] == "Cannot set first_device>0 if CPU is specified under devices" ) + + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_db_script_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): + """Test DB scripts on remote DB""" + + # Set experiment name + exp_name = "test-db-script" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings.set_nodes(1) + run_settings.set_tasks(1) + + # Create Ensemble with two identical models + ensemble = exp.create_ensemble( + "dbscript_ensemble", run_settings=run_settings, replicas=2 + ) + + # Create SmartSim model + smartsim_model = exp.create_model("smartsim_model", run_settings) + # Create 2nd SmartSim model + smartsim_model_2 = exp.create_model("smartsim_model_2", run_settings) + # Create the script string + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + # Add the first ML script to all of the ensemble members + ensemble.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + # Attempt to add a 
duplicate ML model to Ensemble via Ensemble.add_script() + with pytest.raises(SSUnsupportedError) as ex: + ensemble.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + assert ex.value.args[0] == 'A Script with name "test_script1" already exists' + + # Add the first function to all of the ensemble members + ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + # Attempt to add a duplicate ML model to Ensemble via Ensemble.add_function() + with pytest.raises(SSUnsupportedError) as ex: + ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + assert ex.value.args[0] == 'A Script with name "test_func" already exists' + + # Add a script with a non-unique name to a SmartSim Model + smartsim_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + with pytest.raises(SSUnsupportedError) as ex: + ensemble.add_model(smartsim_model) + assert ex.value.args[0] == 'A Script with name "test_script1" already exists' + + # Add a function with a non-unique name to a SmartSim Model + smartsim_model_2.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + with pytest.raises(SSUnsupportedError) as ex: + ensemble.add_model(smartsim_model_2) + assert ex.value.args[0] == 'A Script with name "test_func" already exists' diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index 19c40017e2..7c0e97e414 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index 06c148a959..af04c89cb0 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index 71a63adb91..76a989a2e8 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 2b7db11e18..c69b1746a5 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -58,8 +58,6 @@ def test_batch_model(fileutils, test_dir, wlmutils): batch_settings.set_account(wlmutils.get_test_account()) add_batch_resources(wlmutils, batch_settings) - if wlmutils.get_test_launcher() == "cobalt": - batch_settings.set_queue("debug-flat-quad") run_settings = wlmutils.get_run_settings("python", f"{script} --time=5") model = exp.create_model( "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings @@ -87,8 +85,6 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) - if wlmutils.get_test_launcher() == "cobalt": - batch.set_queue("debug-flat-quad") ensemble = exp.create_ensemble("batch-ens", batch_settings=batch) ensemble.add_model(M1) ensemble.add_model(M2) @@ -110,12 +106,6 @@ def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - sleep(30) - batch.set_queue("debug-flat-quad") ensemble = exp.create_ensemble( "batch-ens-replicas", batch_settings=batch, run_settings=settings, replicas=2 ) diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index f1f5952b3c..058aef895d 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -60,9 +60,6 @@ def test_launch_orc_auto_batch(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:02:00") - if wlmutils.get_test_launcher() == "cobalt": - orc.batch_settings.set_queue("debug-flat-quad") - orc.set_path(test_dir) exp.start(orc, block=True) @@ -99,12 +96,6 @@ def test_launch_cluster_orc_batch_single(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:02:00") - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - time.sleep(120) - orc.batch_settings.set_queue("debug-flat-quad") orc.set_path(test_dir) exp.start(orc, block=True) @@ -141,12 +132,6 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:03:00") - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - time.sleep(120) - orc.batch_settings.set_queue("debug-flat-quad") orc.set_path(test_dir) exp.start(orc, block=True) @@ -178,12 +163,6 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:03:00") - if wlmutils.get_test_launcher() == "cobalt": - # As Cobalt won't allow us to run two - # jobs in the same debug queue, we need - # to make sure the previous test's one is over - time.sleep(120) - orc.batch_settings.set_queue("debug-flat-quad") exp.start(orc, block=True) diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 18e918cfda..7f6cc2ea2b 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -1,6 
+1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -48,9 +48,9 @@ def test_mpmd(fileutils, test_dir, wlmutils): exp_name = "test-mpmd" launcher = wlmutils.get_test_launcher() # MPMD is supported in LSF, but the test for it is different - mpmd_supported = ["slurm", "pbs", "cobalt"] + mpmd_supported = ["slurm", "pbs"] if launcher not in mpmd_supported: - pytest.skip("Test requires Slurm, PBS, or Cobalt to run") + pytest.skip("Test requires Slurm, or PBS to run") # aprun returns an error if the launched app is not an MPI exec # as we do not want to add mpi4py as a dependency, we prefer to @@ -58,7 +58,6 @@ def test_mpmd(fileutils, test_dir, wlmutils): by_launcher = { "slurm": ["srun", "mpirun"], "pbs": ["mpirun"], - "cobalt": ["mpirun"], } exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) diff --git a/tests/full_wlm/test_slurm_allocation.py b/tests/full_wlm/test_slurm_allocation.py index 01d40bf2f3..95de1f4260 100644 --- a/tests/full_wlm/test_slurm_allocation.py +++ b/tests/full_wlm/test_slurm_allocation.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -36,6 +36,29 @@ pytestmark = pytest.mark.skip(reason="Test is only for Slurm WLM systems") +def test_invalid_time_format(wlmutils): + """test slurm interface for formatting walltimes""" + account = wlmutils.get_test_account() + with pytest.raises(ValueError) as e: + alloc = slurm.get_allocation(nodes=1, time="000500", account=account) + assert ( + "Input time must be formatted as `HH:MM:SS` with valid Integers." 
+ in e.value.args[0] + ) + with pytest.raises(ValueError) as e: + alloc = slurm.get_allocation(nodes=1, time="00-05-00", account=account) + assert ( + "Input time must be formatted as `HH:MM:SS` with valid Integers." + in e.value.args[0] + ) + with pytest.raises(ValueError) as e: + alloc = slurm.get_allocation(nodes=1, time="TE:HE:HE", account=account) + assert ( + "Input time must be formatted as `HH:MM:SS` with valid Integers." + in e.value.args[0] + ) + + def test_get_release_allocation(wlmutils): """test slurm interface for obtaining allocations""" account = wlmutils.get_test_account() diff --git a/tests/full_wlm/test_wlm_helper_functions.py b/tests/full_wlm/test_wlm_helper_functions.py index 452d104197..5723939f53 100644 --- a/tests/full_wlm/test_wlm_helper_functions.py +++ b/tests/full_wlm/test_wlm_helper_functions.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_buildenv.py b/tests/install/test_buildenv.py similarity index 98% rename from tests/test_buildenv.py rename to tests/install/test_buildenv.py index d362ca1dd5..21b9a49b82 100644 --- a/tests/test_buildenv.py +++ b/tests/install/test_buildenv.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py new file mode 100644 index 0000000000..5e6c8e597b --- /dev/null +++ b/tests/install/test_builder.py @@ -0,0 +1,364 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import functools +import pathlib +import platform +import threading +import time + +import pytest + +import smartsim._core._install.builder as build +from smartsim._core._install.buildenv import RedisAIVersion + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +RAI_VERSIONS = RedisAIVersion("1.2.7") + +for_each_device = pytest.mark.parametrize("device", ["cpu", "gpu"]) + +_toggle_build_optional_backend = lambda backend: pytest.mark.parametrize( + f"build_{backend}", + [ + pytest.param(switch, id=f"with{'' if switch else 'out'}-{backend}") + for switch in (True, False) + ], +) +toggle_build_tf = _toggle_build_optional_backend("tf") +toggle_build_pt = _toggle_build_optional_backend("pt") +toggle_build_ort = _toggle_build_optional_backend("ort") + + +@pytest.mark.parametrize( + "mock_os", [pytest.param(os_, id=f"os='{os_}'") for os_ in ("Windows", "Java", "")] +) +def test_os_enum_raises_on_unsupported(mock_os): + with pytest.raises(build.BuildError, match="operating system") as err_info: + build.OperatingSystem.from_str(mock_os) + + +@pytest.mark.parametrize( + "mock_arch", + [ + pytest.param(arch_, id=f"arch='{arch_}'") + for arch_ in ("i386", "i686", "i86pc", "aarch64", "armv7l", "") + ], +) +def test_arch_enum_raises_on_unsupported(mock_arch): + with pytest.raises(build.BuildError, match="architecture"): + build.Architecture.from_str(mock_arch) + + +@pytest.fixture +def p_test_dir(test_dir): + yield pathlib.Path(test_dir).resolve() + + +@for_each_device +def test_rai_builder_raises_if_attempting_to_place_deps_when_build_dir_dne( + monkeypatch, p_test_dir, device +): + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + monkeypatch.setattr( + build.RedisAIBuilder, + "rai_build_path", + property(lambda self: p_test_dir / "path/to/dir/that/dne"), + ) + rai_builder = build.RedisAIBuilder() + with pytest.raises(build.BuildError, match=r"build directory not found"): + 
rai_builder._fetch_deps_for(device) + + +@for_each_device +def test_rai_builder_raises_if_attempting_to_place_deps_in_nonempty_dir( + monkeypatch, p_test_dir, device +): + (p_test_dir / "some_file.txt").touch() + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + monkeypatch.setattr( + build.RedisAIBuilder, "rai_build_path", property(lambda self: p_test_dir) + ) + monkeypatch.setattr( + build.RedisAIBuilder, "get_deps_dir_path_for", lambda *a, **kw: p_test_dir + ) + rai_builder = build.RedisAIBuilder() + + with pytest.raises(build.BuildError, match=r"is not empty"): + rai_builder._fetch_deps_for(device) + + +invalid_build_arm64 = [ + dict(build_tf=True, build_onnx=True), + dict(build_tf=False, build_onnx=True), + dict(build_tf=True, build_onnx=False), +] +invalid_build_ids = [ + ",".join([f"{key}={value}" for key, value in d.items()]) + for d in invalid_build_arm64 +] + + +@pytest.mark.parametrize("build_options", invalid_build_arm64, ids=invalid_build_ids) +def test_rai_builder_raises_if_unsupported_deps_on_arm64(build_options): + with pytest.raises(build.BuildError, match=r"are not supported on.*ARM64"): + build.RedisAIBuilder( + _os=build.OperatingSystem.DARWIN, + architecture=build.Architecture.ARM64, + **build_options, + ) + + +def _confirm_inst_presence(type_, should_be_present, seq): + expected_num_occurrences = 1 if should_be_present else 0 + occurrences = filter(lambda item: isinstance(item, type_), seq) + return expected_num_occurrences == len(tuple(occurrences)) + + +# Helper functions to check for the presence (or absence) of a +# ``_RAIBuildDependency`` dependency in a list of dependencies that need to be +# fetched by a ``RedisAIBuilder`` instance +dlpack_dep_presence = functools.partial( + _confirm_inst_presence, build._DLPackRepository, True +) +pt_dep_presence = functools.partial(_confirm_inst_presence, build._PTArchive) +tf_dep_presence = functools.partial(_confirm_inst_presence, build._TFArchive) +ort_dep_presence = 
functools.partial(_confirm_inst_presence, build._ORTArchive) + + +@for_each_device +@toggle_build_tf +@toggle_build_pt +@toggle_build_ort +def test_rai_builder_will_add_dep_if_backend_requested_wo_duplicates( + monkeypatch, device, build_tf, build_pt, build_ort +): + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + + rai_builder = build.RedisAIBuilder( + build_tf=build_tf, build_torch=build_pt, build_onnx=build_ort + ) + requested_backends = rai_builder._get_deps_to_fetch_for(device) + assert dlpack_dep_presence(requested_backends) + assert tf_dep_presence(build_tf, requested_backends) + assert pt_dep_presence(build_pt, requested_backends) + assert ort_dep_presence(build_ort, requested_backends) + + +@for_each_device +@toggle_build_tf +@toggle_build_pt +def test_rai_builder_will_not_add_dep_if_custom_dep_path_provided( + monkeypatch, device, p_test_dir, build_tf, build_pt +): + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + mock_ml_lib = p_test_dir / "some/ml/lib" + mock_ml_lib.mkdir(parents=True) + rai_builder = build.RedisAIBuilder( + build_tf=build_tf, + build_torch=build_pt, + build_onnx=False, + libtf_dir=str(mock_ml_lib if build_tf else ""), + torch_dir=str(mock_ml_lib if build_pt else ""), + ) + requested_backends = rai_builder._get_deps_to_fetch_for(device) + assert dlpack_dep_presence(requested_backends) + assert tf_dep_presence(False, requested_backends) + assert pt_dep_presence(False, requested_backends) + assert ort_dep_presence(False, requested_backends) + assert len(requested_backends) == 1 + + +def test_rai_builder_raises_if_it_fetches_an_unexpected_number_of_ml_deps( + monkeypatch, p_test_dir +): + monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) + monkeypatch.setattr( + build.RedisAIBuilder, "rai_build_path", property(lambda self: p_test_dir) + ) + monkeypatch.setattr( + build, + "_place_rai_dep_at", + lambda target, verbose: lambda dep: target + / 
"whoops_all_ml_deps_extract_to_a_dir_with_this_name", + ) + rai_builder = build.RedisAIBuilder(build_tf=True, build_torch=True, build_onnx=True) + with pytest.raises( + build.BuildError, + match=r"Expected to place \d+ dependencies, but only found \d+", + ): + rai_builder._fetch_deps_for("cpu") + + +def test_threaded_map(): + def _some_io_op(x): + return x * x + + assert (0, 1, 4, 9, 16) == tuple(build._threaded_map(_some_io_op, range(5))) + + +def test_threaded_map_returns_early_if_nothing_to_map(): + sleep_duration = 60 + + def _some_long_io_op(_): + time.sleep(sleep_duration) + + start = time.time() + build._threaded_map(_some_long_io_op, []) + end = time.time() + assert end - start < sleep_duration + + +def test_correct_pt_variant_os(): + # Check that all Linux variants return Linux + for linux_variant in build.OperatingSystem.LINUX.value: + os_ = build.OperatingSystem.from_str(linux_variant) + assert build._choose_pt_variant(os_) == build._PTArchiveLinux + + # Check that ARM64 and X86_64 Mac OSX return the Mac variant + all_archs = (build.Architecture.ARM64, build.Architecture.X64) + for arch in all_archs: + os_ = build.OperatingSystem.DARWIN + assert build._choose_pt_variant(os_) == build._PTArchiveMacOSX + + +def test_PTArchiveMacOSX_url(): + arch = build.Architecture.X64 + pt_version = RAI_VERSIONS.torch + + pt_linux_cpu = build._PTArchiveLinux(build.Architecture.X64, "cpu", pt_version) + x64_prefix = "https://download.pytorch.org/libtorch/" + assert x64_prefix in pt_linux_cpu.url + + pt_macosx_cpu = build._PTArchiveMacOSX(build.Architecture.ARM64, "cpu", pt_version) + arm64_prefix = "https://github.com/CrayLabs/ml_lib_builder/releases/download/" + assert arm64_prefix in pt_macosx_cpu.url + + +def test_PTArchiveMacOSX_gpu_error(): + with pytest.raises(build.BuildError, match="support GPU on Mac OSX"): + build._PTArchiveMacOSX(build.Architecture.ARM64, "gpu", RAI_VERSIONS.torch).url + + +def test_valid_platforms(): + assert build.RedisAIBuilder( + 
_os=build.OperatingSystem.LINUX, + architecture=build.Architecture.X64, + build_tf=True, + build_torch=True, + build_onnx=True, + ) + assert build.RedisAIBuilder( + _os=build.OperatingSystem.DARWIN, + architecture=build.Architecture.X64, + build_tf=True, + build_torch=True, + build_onnx=False, + ) + assert build.RedisAIBuilder( + _os=build.OperatingSystem.DARWIN, + architecture=build.Architecture.X64, + build_tf=False, + build_torch=True, + build_onnx=False, + ) + + +@pytest.mark.parametrize( + "plat,cmd,expected_cmd", + [ + # Bare Word + pytest.param( + build.Platform(build.OperatingSystem.LINUX, build.Architecture.X64), + ["git", "clone", "my-repo"], + ["git", "clone", "my-repo"], + id="git-Linux-X64", + ), + pytest.param( + build.Platform(build.OperatingSystem.LINUX, build.Architecture.ARM64), + ["git", "clone", "my-repo"], + ["git", "clone", "my-repo"], + id="git-Linux-Arm64", + ), + pytest.param( + build.Platform(build.OperatingSystem.DARWIN, build.Architecture.X64), + ["git", "clone", "my-repo"], + ["git", "clone", "my-repo"], + id="git-Darwin-X64", + ), + pytest.param( + build.Platform(build.OperatingSystem.DARWIN, build.Architecture.ARM64), + ["git", "clone", "my-repo"], + [ + "git", + "clone", + "--config", + "core.autocrlf=false", + "--config", + "core.eol=lf", + "my-repo", + ], + id="git-Darwin-Arm64", + ), + # Abs path + pytest.param( + build.Platform(build.OperatingSystem.LINUX, build.Architecture.X64), + ["/path/to/git", "clone", "my-repo"], + ["/path/to/git", "clone", "my-repo"], + id="Abs-Linux-X64", + ), + pytest.param( + build.Platform(build.OperatingSystem.LINUX, build.Architecture.ARM64), + ["/path/to/git", "clone", "my-repo"], + ["/path/to/git", "clone", "my-repo"], + id="Abs-Linux-Arm64", + ), + pytest.param( + build.Platform(build.OperatingSystem.DARWIN, build.Architecture.X64), + ["/path/to/git", "clone", "my-repo"], + ["/path/to/git", "clone", "my-repo"], + id="Abs-Darwin-X64", + ), + pytest.param( + 
build.Platform(build.OperatingSystem.DARWIN, build.Architecture.ARM64), + ["/path/to/git", "clone", "my-repo"], + [ + "/path/to/git", + "clone", + "--config", + "core.autocrlf=false", + "--config", + "core.eol=lf", + "my-repo", + ], + id="Abs-Darwin-Arm64", + ), + ], +) +def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected_cmd): + assert build.config_git_command(plat, cmd) == expected_cmd diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py index d75cc635f0..0b31eedd2c 100644 --- a/tests/on_wlm/test_base_settings_on_wlm.py +++ b/tests/on_wlm/test_base_settings_on_wlm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index fa05eb5130..8baf74bf41 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index 198a92f437..8dc4baae0f 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index ab100d1a7e..6cf1c3918f 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_het_job.py b/tests/on_wlm/test_het_job.py index e8f20d1ee6..5a039a7c93 100644 --- a/tests/on_wlm/test_het_job.py +++ b/tests/on_wlm/test_het_job.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py index 7281cb568c..905d96f549 100644 --- a/tests/on_wlm/test_launch_errors.py +++ b/tests/on_wlm/test_launch_errors.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_launch_ompi_lsf.py b/tests/on_wlm/test_launch_ompi_lsf.py index ed082d22e7..ed5de291b0 100644 --- a/tests/on_wlm/test_launch_ompi_lsf.py +++ b/tests/on_wlm/test_launch_ompi_lsf.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_local_step.py b/tests/on_wlm/test_local_step.py index 02e619ebfe..4e5f45e0b3 100644 --- a/tests/on_wlm/test_local_step.py +++ b/tests/on_wlm/test_local_step.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py index b1997961d4..42bbe752c0 100644 --- a/tests/on_wlm/test_restart.py +++ b/tests/on_wlm/test_restart.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index 08bf875e21..1611781eb6 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -50,10 +50,8 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() - if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: - pytest.skip( - "Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM" - ) + if launcher not in ["pbs", "slurm", "lsf"]: + pytest.skip("Test only runs on systems with LSF, PBSPro, or Slurm as WLM") exp_name = "test-simplebase-settings-model-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) @@ -70,10 +68,8 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() - if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: - pytest.skip( - "Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM" - ) + if launcher not in ["pbs", "slurm", "lsf"]: + pytest.skip("Test only runs on systems with LSF, PBSPro, or Slurm as WLM") exp_name = "test-simplebase-settings-model-stop" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index a05d7be0f0..1ecc27442d 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_slurm_commands.py b/tests/on_wlm/test_slurm_commands.py index d3ebbcd317..8411be6e0a 100644 --- a/tests/on_wlm/test_slurm_commands.py +++ b/tests/on_wlm/test_slurm_commands.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index 383c6c4bdd..8d75d9f659 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/on_wlm/test_wlm_orc_config_settings.py b/tests/on_wlm/test_wlm_orc_config_settings.py index 3de59075ea..f9ab60609b 100644 --- a/tests/on_wlm/test_wlm_orc_config_settings.py +++ b/tests/on_wlm/test_wlm_orc_config_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_alps_settings.py b/tests/test_alps_settings.py index 012f27fce1..b3c4c3bdb4 100644 --- a/tests/test_alps_settings.py +++ b/tests/test_alps_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_batch_settings.py b/tests/test_batch_settings.py index cb2096727e..db269a9b55 100644 --- a/tests/test_batch_settings.py +++ b/tests/test_batch_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_cli.py b/tests/test_cli.py index 899caa1e0f..710a9a6595 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -828,3 +828,33 @@ def test_cli_validation_test_execute( assert expected_stdout in caplog.text assert actual_retval == expected_retval + + +def test_validate_correctly_sets_and_restores_env(monkeypatch): + monkeypatch.setenv("FOO", "BAR") + monkeypatch.setenv("SPAM", "EGGS") + monkeypatch.delenv("TICK", raising=False) + monkeypatch.delenv("DNE", raising=False) + + assert os.environ["FOO"] == "BAR" + assert os.environ["SPAM"] == "EGGS" + assert "TICK" not in os.environ + assert "DNE" not in os.environ + + with smartsim._core._cli.validate._env_vars_set_to( + { + "FOO": "BAZ", # Redefine + "SPAM": None, # Delete + "TICK": "TOCK", # Add + "DNE": None, # Delete already missing + } + ): + assert os.environ["FOO"] == "BAZ" + assert "SPAM" not in os.environ + assert os.environ["TICK"] == "TOCK" + assert "DNE" not in os.environ + + assert os.environ["FOO"] == "BAR" + assert os.environ["SPAM"] == "EGGS" + assert "TICK" not in os.environ + assert "DNE" not in os.environ diff --git a/tests/test_cobalt_parser.py b/tests/test_cobalt_parser.py deleted file mode 100644 index e91c95100a..0000000000 --- 
a/tests/test_cobalt_parser.py +++ /dev/null @@ -1,54 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import pytest - -from smartsim._core.launcher.cobalt import cobaltParser - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - - -def test_parse_step_id(): - output = "JobName JobId \n" "=====================\n" "smartsim 507975 \n" - step_id = cobaltParser.parse_cobalt_step_id(output, "smartsim") - assert step_id == "507975" - - -def test_parse_step_status(): - output = "JobName State \n" "=====================\n" "smartsim running \n" - step_id = cobaltParser.parse_cobalt_step_status(output, "smartsim") - assert step_id == "running" - - -def test_parse_qsub_out(): - output = ( - "Job routed to queue 'debug-flat-quad'.\n" - "Memory mode set to flat quad for queue debug-flat-quad\n" - "507998\n" - ) - step_id = cobaltParser.parse_qsub_out(output) - assert step_id == "507998" diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index e688017626..138ceb4b7f 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_colo_model_lsf.py b/tests/test_colo_model_lsf.py index e77eeedec2..5e1c449cca 100644 --- a/tests/test_colo_model_lsf.py +++ b/tests/test_colo_model_lsf.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_config.py b/tests/test_config.py index bbbb54526d..0716ac0d51 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -205,7 +205,7 @@ def test_redis_cli(): pytest.param("0", False, id="letter zero"), pytest.param("1", True, id="letter one"), pytest.param("-1", False, id="letter negative one"), - pytest.param(None, False, id="not in env"), + pytest.param(None, True, id="not in env"), ], ) def test_telemetry_flag( diff --git a/tests/test_configs/bad.py b/tests/test_configs/bad.py index 93e4864ffa..4efe8b9a19 100644 --- a/tests/test_configs/bad.py +++ b/tests/test_configs/bad.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/check_dirs.py b/tests/test_configs/check_dirs.py index 07b358d491..b817bde9ad 100644 --- a/tests/test_configs/check_dirs.py +++ b/tests/test_configs/check_dirs.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/cov/local_cov.cfg b/tests/test_configs/cov/local_cov.cfg index d25e9f83a6..481cc08c18 100644 --- a/tests/test_configs/cov/local_cov.cfg +++ b/tests/test_configs/cov/local_cov.cfg @@ -2,7 +2,6 @@ omit = *pbs* *slurm* - *cobalt* *mpirun* *alps* *lsf* @@ -39,7 +38,6 @@ exclude_lines= pragma: no cover cov-pbs cov-slurm - cov-cobalt cov-alps cov-mpirun cov-wlm @@ -49,6 +47,5 @@ exclude_lines= launcher == "slurm" launcher == "pbs" - launcher == "cobalt" launcher == "lsf" launcher == "pals" diff --git a/tests/test_configs/cov/lsf_cov.cfg b/tests/test_configs/cov/lsf_cov.cfg index 6e5f52eb4a..03b27c5ec7 100644 --- a/tests/test_configs/cov/lsf_cov.cfg +++ b/tests/test_configs/cov/lsf_cov.cfg @@ -2,7 +2,6 @@ omit = *slurm* *local* - *cobalt* *pbs* *alps* *redis_starter.py* @@ -36,11 +35,9 @@ exclude_lines= cov-slurm cov-local - cov-cobalt cov-alps cov-pbs pass launcher == "local" launcher == "slurm" - launcher == "cobalt" launcher == "pbs" diff --git a/tests/test_configs/cov/pbs_cov.cfg b/tests/test_configs/cov/pbs_cov.cfg index 99e7bcfd62..f9274cbf6d 100644 --- a/tests/test_configs/cov/pbs_cov.cfg +++ b/tests/test_configs/cov/pbs_cov.cfg @@ -2,7 +2,6 @@ omit = *slurm* *local* - *cobalt* *mpirun* *alps* *lsf* @@ -37,11 +36,9 @@ exclude_lines= cov-slurm cov-local - cov-cobalt cov-alps cov-lsf pass launcher == "local" launcher == "slurm" - launcher == "cobalt" launcher == "lsf" diff --git a/tests/test_configs/cov/slurm_cov.cfg b/tests/test_configs/cov/slurm_cov.cfg index 59405bc35b..5aa77cfbe1 100644 --- a/tests/test_configs/cov/slurm_cov.cfg +++ b/tests/test_configs/cov/slurm_cov.cfg @@ -2,7 +2,6 @@ omit = *pbs* *local* - *cobalt* *mpirun* *alps* *lsf* @@ -37,11 +36,9 @@ exclude_lines= cov-pbs cov-local - cov-cobalt cov-alps cov-lsf pass launcher == "local" launcher == "pbs" - launcher == "cobalt" launcher == "lsf" diff --git a/tests/test_configs/echo.py 
b/tests/test_configs/echo.py index 6523f4e4ff..5d9a57ebb5 100644 --- a/tests/test_configs/echo.py +++ b/tests/test_configs/echo.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/generator_files/circular_config/sub_dir/hello.sh b/tests/test_configs/generator_files/circular_config/sub_dir/hello.sh index 705c1dcc62..6d9be12142 100644 --- a/tests/test_configs/generator_files/circular_config/sub_dir/hello.sh +++ b/tests/test_configs/generator_files/circular_config/sub_dir/hello.sh @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/generator_files/multi_tags_template.sh b/tests/test_configs/generator_files/multi_tags_template.sh index 4fd79d3214..a7131e8927 100644 --- a/tests/test_configs/generator_files/multi_tags_template.sh +++ b/tests/test_configs/generator_files/multi_tags_template.sh @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/generator_files/test_dir/test.in b/tests/test_configs/generator_files/test_dir/test.in index 8a0a76ee28..f91f0256ce 100644 --- a/tests/test_configs/generator_files/test_dir/test.in +++ b/tests/test_configs/generator_files/test_dir/test.in @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/hw_mpi.c b/tests/test_configs/hw_mpi.c index 4cfa046319..995047bb04 100644 --- a/tests/test_configs/hw_mpi.c +++ b/tests/test_configs/hw_mpi.c @@ -1,7 +1,7 @@ /* * BSD 2-Clause License * - * Copyright (c) 2021-2023, Hewlett Packard Enterprise + * Copyright (c) 2021-2024, Hewlett Packard Enterprise * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/incoming_entities_reader.py b/tests/test_configs/incoming_entities_reader.py index c558271e30..32d670ed1b 100644 --- a/tests/test_configs/incoming_entities_reader.py +++ b/tests/test_configs/incoming_entities_reader.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/ml/training_service_torch.py b/tests/test_configs/ml/training_service_torch.py index 575940031e..2a6bac051f 100644 --- a/tests/test_configs/ml/training_service_torch.py +++ b/tests/test_configs/ml/training_service_torch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/openmpi4/mpiexec b/tests/test_configs/mpi_impl_stubs/openmpi4/mpiexec index a5e1157d37..b8c34bb631 100755 --- a/tests/test_configs/mpi_impl_stubs/openmpi4/mpiexec +++ b/tests/test_configs/mpi_impl_stubs/openmpi4/mpiexec @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/openmpi4/mpirun b/tests/test_configs/mpi_impl_stubs/openmpi4/mpirun index 9cfffde3da..fa1d3d0ac2 100755 --- a/tests/test_configs/mpi_impl_stubs/openmpi4/mpirun +++ b/tests/test_configs/mpi_impl_stubs/openmpi4/mpirun @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/openmpi4/orterun b/tests/test_configs/mpi_impl_stubs/openmpi4/orterun index 66de764176..31f7dd9aa2 100755 --- a/tests/test_configs/mpi_impl_stubs/openmpi4/orterun +++ b/tests/test_configs/mpi_impl_stubs/openmpi4/orterun @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/pals/mpiexec b/tests/test_configs/mpi_impl_stubs/pals/mpiexec index 70b27d1778..393ee2e3a6 100755 --- a/tests/test_configs/mpi_impl_stubs/pals/mpiexec +++ b/tests/test_configs/mpi_impl_stubs/pals/mpiexec @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/mpi_impl_stubs/slurm/mpiexec b/tests/test_configs/mpi_impl_stubs/slurm/mpiexec index 46fbacf58d..07ff9881d4 100755 --- a/tests/test_configs/mpi_impl_stubs/slurm/mpiexec +++ b/tests/test_configs/mpi_impl_stubs/slurm/mpiexec @@ -2,7 +2,7 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/reconnect_node.py b/tests/test_configs/reconnect_node.py index 3ff3d71ef7..1897529d6f 100644 --- a/tests/test_configs/reconnect_node.py +++ b/tests/test_configs/reconnect_node.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/reconnect_sim.py b/tests/test_configs/reconnect_sim.py index eda634517e..a8952406d3 100644 --- a/tests/test_configs/reconnect_sim.py +++ b/tests/test_configs/reconnect_sim.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py index b486b3fcd6..2fdab961b7 100644 --- a/tests/test_configs/run_dbscript_smartredis.py +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/run_pt_dbmodel_smartredis.py b/tests/test_configs/run_pt_dbmodel_smartredis.py index 600ae2cb30..dd869c65a8 100644 --- a/tests/test_configs/run_pt_dbmodel_smartredis.py +++ b/tests/test_configs/run_pt_dbmodel_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/run_tf_dbmodel_smartredis.py b/tests/test_configs/run_tf_dbmodel_smartredis.py index 874b40c9cd..a760094018 100644 --- a/tests/test_configs/run_tf_dbmodel_smartredis.py +++ b/tests/test_configs/run_tf_dbmodel_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/send_data.py b/tests/test_configs/send_data.py index dd9194031a..f9b9440c47 100644 --- a/tests/test_configs/send_data.py +++ b/tests/test_configs/send_data.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/send_data_local_smartredis.py b/tests/test_configs/send_data_local_smartredis.py index 0c318736f2..34191bcca7 100644 --- a/tests/test_configs/send_data_local_smartredis.py +++ b/tests/test_configs/send_data_local_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/sleep.py b/tests/test_configs/sleep.py index 778b8946e6..d74d43bf10 100644 --- a/tests/test_configs/sleep.py +++ b/tests/test_configs/sleep.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_configs/torchscript.py b/tests/test_configs/torchscript.py index 1ec0e71d58..a90f165aaf 100644 --- a/tests/test_configs/torchscript.py +++ b/tests/test_configs/torchscript.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_containers.py b/tests/test_containers.py index e35b4f3095..21fe50ad4c 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_controller.py b/tests/test_controller.py index 65687ec596..1498727085 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index a40ccdf663..a02c17678d 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index ec0ed23ea0..227572ac97 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py index 4545e80bf1..0632eee16f 100644 --- a/tests/test_ensemble.py +++ b/tests/test_ensemble.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_entitylist.py b/tests/test_entitylist.py index 675e844264..89f56b7ab1 100644 --- a/tests/test_entitylist.py +++ b/tests/test_entitylist.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_experiment.py b/tests/test_experiment.py index c0185ab6d3..12b2f15798 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,6 +31,7 @@ from smartsim._core.config import CONFIG from smartsim.entity import Model from smartsim.error import SmartSimError +from smartsim.error.errors import SSUnsupportedError from smartsim.settings import RunSettings from smartsim.status import STATUS_NEVER_STARTED @@ -178,3 +179,8 @@ def test_enable_disable_telemtery(monkeypatch): assert CONFIG.telemetry_enabled exp.disable_telemetry() assert not CONFIG.telemetry_enabled + + +def test_error_on_cobalt(): + with pytest.raises(SSUnsupportedError): + exp = Experiment("cobalt_exp", launcher="cobalt") diff --git a/tests/test_generator.py b/tests/test_generator.py index e4618f9cd9..fd9a5b8363 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 784219f82e..025f53d32d 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_indirect.py b/tests/test_indirect.py index f8af882668..73f381441e 100644 --- a/tests/test_indirect.py +++ b/tests/test_indirect.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,9 +31,9 @@ import psutil import pytest +from smartsim._core.config import CONFIG from smartsim._core.entrypoints.indirect import cleanup, get_parser, get_ts, main from smartsim._core.utils.helpers import encode_cmd -from smartsim._core.utils.serialize import MANIFEST_FILENAME, TELMON_SUBDIR ALL_ARGS = { "+command", @@ -152,7 +152,7 @@ def test_indirect_main_dir_check(test_dir): cmd = ["echo", "unit-test"] encoded_cmd = encode_cmd(cmd) - status_path = exp_dir / TELMON_SUBDIR + status_path = exp_dir / CONFIG.telemetry_subdir # show that a missing status_path is created when missing main(encoded_cmd, "application", exp_dir, status_path) @@ -167,7 +167,7 @@ def test_indirect_main_cmd_check(capsys, test_dir, monkeypatch): captured = capsys.readouterr() # throw away existing output with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: ctx.setattr("smartsim._core.entrypoints.indirect.logger.error", print) - _ = main("", "application", exp_dir, exp_dir / TELMON_SUBDIR) + _ = main("", "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) captured = capsys.readouterr() assert "Invalid cmd supplied" in ex.value.args[0] @@ -175,7 +175,8 @@ def test_indirect_main_cmd_check(capsys, test_dir, monkeypatch): # test with non-emptystring cmd with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: ctx.setattr("smartsim._core.entrypoints.indirect.logger.error", print) - _ = main(" \n \t ", "application", exp_dir, exp_dir / TELMON_SUBDIR) + status_dir = exp_dir / CONFIG.telemetry_subdir 
+ _ = main(" \n \t ", "application", exp_dir, status_dir) captured = capsys.readouterr() assert "Invalid cmd supplied" in ex.value.args[0] @@ -190,13 +191,13 @@ def test_complete_process(fileutils, test_dir): raw_cmd = f"{sys.executable} {script} --time=1" cmd = encode_cmd(raw_cmd.split()) - rc = main(cmd, "application", exp_dir, exp_dir / TELMON_SUBDIR) + rc = main(cmd, "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) assert rc == 0 assert exp_dir.exists() # NOTE: don't have a manifest so we're falling back to default event path - data_dir = exp_dir / TELMON_SUBDIR + data_dir = exp_dir / CONFIG.telemetry_subdir start_events = list(data_dir.rglob("start.json")) stop_events = list(data_dir.rglob("stop.json")) diff --git a/tests/test_init.py b/tests/test_init.py index 76f58b59ad..dfb58bd557 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_interrupt.py b/tests/test_interrupt.py index 73616a848d..28c48e0db2 100644 --- a/tests/test_interrupt.py +++ b/tests/test_interrupt.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index 51d8b60a60..0557f3cf40 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index e4d593b6f9..7befff95ed 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index b6eaba56aa..576e290ca2 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index b054011380..c59aebd7b2 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_logs.py b/tests/test_logs.py new file mode 100644 index 0000000000..88c6a738f7 --- /dev/null +++ b/tests/test_logs.py @@ -0,0 +1,212 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import pathlib + +import pytest + +import smartsim +import smartsim.log +from smartsim import Experiment + +_CFG_TM_ENABLED_ATTR = "telemetry_enabled" + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +@pytest.fixture +def turn_on_tm(monkeypatch): + monkeypatch.setattr( + smartsim._core.config.config.Config, + _CFG_TM_ENABLED_ATTR, + property(lambda self: True), + ) + yield + + +@pytest.mark.parametrize( + "level,expect_d,expect_i,expect_w,expect_e", + [ + pytest.param("DEBUG", True, False, False, False, id="debug-level"), + pytest.param("INFO", True, True, False, False, id="info-level"), + pytest.param("WARNING", True, True, True, False, id="warn-level"), + pytest.param("ERROR", True, True, True, True, id="err-level"), + ], +) +def test_lowpass_filter(level, expect_d, expect_i, expect_w, expect_e): + """Ensure that messages above maximum are not logged""" + log_filter = smartsim.log.LowPassFilter(level) + + 
faux_out_stream = io.StringIO() + handler = logging.StreamHandler(faux_out_stream) + handler.setFormatter(logging.Formatter("%(message)s")) + + logger = logging.getLogger(f"test_level_filter_{level}") + logger.addHandler(handler) + logger.addFilter(log_filter) + + logger.debug(str(logging.DEBUG)) + logger.info(str(logging.INFO)) + logger.warning(str(logging.WARNING)) + logger.exception(str(logging.ERROR)) + + logged_messages = faux_out_stream.getvalue().split("\n") + assert (str(logging.DEBUG) in logged_messages) == expect_d + assert (str(logging.INFO) in logged_messages) == expect_i + assert (str(logging.WARN) in logged_messages) == expect_w + assert (str(logging.ERROR) in logged_messages) == expect_e + + +def test_add_exp_loggers(test_dir): + """Ensure that expected loggers are added""" + # test_dir = fileutils.make_test_dir() + faux_out_stream = io.StringIO() + + logger = logging.getLogger("smartsim_test_add_exp_loggers") + logger.addHandler(logging.StreamHandler(faux_out_stream)) + + out_file = pathlib.Path(test_dir) / "smartsim.out" + err_file = pathlib.Path(test_dir) / "smartsim.err" + + filter_fn = lambda x: True + + smartsim.log.log_to_exp_file(str(out_file), logger, log_filter=filter_fn) + smartsim.log.log_to_exp_file(str(err_file), logger, "WARN") + + logger.debug("debug") + logger.exception("exception") + + assert out_file.exists() + assert out_file.is_file() + + assert err_file.exists() + assert err_file.is_file() + + +def test_get_logger(test_dir: str, turn_on_tm, monkeypatch): + """Ensure the correct logger type is instantiated""" + monkeypatch.setenv("SMARTSIM_LOG_LEVEL", "developer") + logger = smartsim.log.get_logger("SmartSimTest", "INFO") + assert isinstance(logger, smartsim.log.ContextAwareLogger) + + +@pytest.mark.parametrize( + "input_level,exp_level", + [ + pytest.param("INFO", "info", id="lowercasing only, INFO"), + pytest.param("info", "info", id="input back, info"), + pytest.param("WARNING", "warning", id="lowercasing only, WARNING"), + 
pytest.param("warning", "warning", id="input back, warning"), + pytest.param("QUIET", "warning", id="lowercasing only, QUIET"), + pytest.param("quiet", "warning", id="translation back, quiet"), + pytest.param("DEVELOPER", "debug", id="lowercasing only, DEVELOPER"), + pytest.param("developer", "debug", id="translation back, developer"), + ], +) +def test_translate_log_level(input_level: str, exp_level: str, turn_on_tm): + """Ensure the correct logger type is instantiated""" + translated_level = smartsim.log._translate_log_level(input_level) + assert exp_level == translated_level + + +def test_exp_logs(test_dir: str, turn_on_tm, monkeypatch): + """Ensure that experiment loggers are added when context info exists""" + monkeypatch.setenv("SMARTSIM_LOG_LEVEL", "developer") + test_dir = pathlib.Path(test_dir) + test_dir.mkdir(parents=True, exist_ok=True) + + token = smartsim.log.ctx_exp_path.set(test_dir) + + try: + logger = smartsim.log.get_logger("SmartSimTest", "INFO") + + faux_out_stream = io.StringIO() + logger.addHandler(logging.StreamHandler(faux_out_stream)) + + log_msg = "testing in a test!" + err_msg = "erroring in a test!" 
+ logger.info(log_msg) + logger.error(err_msg) + + # ensure that the default stream is written to + logged = faux_out_stream.getvalue() + + assert log_msg in logged + assert err_msg in logged + + out_file, err_file = smartsim.log.get_exp_log_paths() + + out_content = out_file.read_text() + err_content = err_file.read_text() + + # ensure the low-pass filter logs non-errors to out file + assert log_msg in out_content + assert err_msg not in out_content + assert str(test_dir) in out_content + + # ensure the errors are logged to err file + assert err_msg in err_content + assert log_msg not in err_content + assert str(err_msg) in err_content + finally: + smartsim.log.ctx_exp_path.reset(token) + + +def test_context_leak(test_dir: str, turn_on_tm, monkeypatch): + """Ensure that exceptions do not leave the context in an invalid state""" + test_dir = pathlib.Path(test_dir) + test_dir.mkdir(parents=True, exist_ok=True) + + original_ctx_value = test_dir / pathlib.Path("some value") + ctx_var = smartsim.log.ctx_exp_path + token = ctx_var.set(original_ctx_value) + + err_msg = "some ex occurred in JobManager" + + def thrower(_self): + raise Exception(err_msg) + + try: + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.control.jobmanager.JobManager, "start", thrower) + exp = Experiment("MyExperiment", launcher="local", exp_path=str(test_dir)) + + sleep_rs = exp.create_run_settings("sleep", ["2"]) + sleep_rs.set_nodes(1) + sleep_rs.set_tasks(1) + + sleep = exp.create_model("SleepModel", sleep_rs) + exp.generate(sleep) + exp.start(sleep, block=True) + except Exception as ex: + assert err_msg in ex.args + finally: + assert ctx_var.get() == original_ctx_value + ctx_var.reset(token) + assert ctx_var.get() == "" diff --git a/tests/test_lsf_parser.py b/tests/test_lsf_parser.py index f41de54d8b..abd27eb5ae 100644 --- a/tests/test_lsf_parser.py +++ b/tests/test_lsf_parser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# 
Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_lsf_settings.py b/tests/test_lsf_settings.py index a71d658cbf..fcb3516483 100644 --- a/tests/test_lsf_settings.py +++ b/tests/test_lsf_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_manifest.py b/tests/test_manifest.py index ea9920fad1..33fc6b1634 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,6 +40,7 @@ _LaunchedManifestMetadata as LaunchedManifestMetadata, ) from smartsim.database import Orchestrator +from smartsim.entity.dbobject import DBModel, DBScript from smartsim.error import SmartSimError from smartsim.settings import RunSettings @@ -61,6 +62,9 @@ orc_1.name = "orc2" model_no_name = exp.create_model(name=None, run_settings=rs) +db_script = DBScript("some-script", "def main():\n print('hello world')\n") +db_model = DBModel("some-model", "TORCH", b"some-model-bytes") + def test_separate(): manifest = Manifest(model, ensemble, orc) @@ -106,6 +110,38 @@ class Person: _ = Manifest(p) +@pytest.mark.parametrize( + "patch, has_db_objects", + [ + pytest.param((), False, id="No DB Objects"), + pytest.param((model, "_db_models", [db_model]), True, id="Model w/ DB Model"), + pytest.param( + (model, "_db_scripts", [db_script]), True, id="Model w/ DB Script" + ), + pytest.param( + (ensemble, "_db_models", [db_model]), True, id="Ensemble w/ DB Model" + ), + pytest.param( + (ensemble, "_db_scripts", 
[db_script]), True, id="Ensemble w/ DB Script" + ), + pytest.param( + (ensemble.entities[0], "_db_models", [db_model]), + True, + id="Ensemble Member w/ DB Model", + ), + pytest.param( + (ensemble.entities[0], "_db_scripts", [db_script]), + True, + id="Ensemble Member w/ DB Script", + ), + ], +) +def test_manifest_detects_db_objects(monkeypatch, patch, has_db_objects): + if patch: + monkeypatch.setattr(*patch) + assert Manifest(model, ensemble).has_db_objects == has_db_objects + + def test_launched_manifest_transform_data(): models = [(model, 1), (model_2, 2)] ensembles = [(ensemble, [(m, i) for i, m in enumerate(ensemble.entities)])] diff --git a/tests/test_model.py b/tests/test_model.py index 88700ad23c..a1b5ba505a 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_modelwriter.py b/tests/test_modelwriter.py index 4554a8b5a5..a857d7c5f0 100644 --- a/tests/test_modelwriter.py +++ b/tests/test_modelwriter.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_mpi_settings.py b/tests/test_mpi_settings.py index 4e0bc48c88..7d8db6e757 100644 --- a/tests/test_mpi_settings.py +++ b/tests/test_mpi_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_multidb.py b/tests/test_multidb.py index c4336294e6..af21f5a1e8 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_orc_config_settings.py b/tests/test_orc_config_settings.py index f08467be0d..3655964968 100644 --- a/tests/test_orc_config_settings.py +++ b/tests/test_orc_config_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 4a1b08367c..f87aa9331e 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -256,54 +256,6 @@ def test_orc_results_in_correct_number_of_shards(single_cmd): ) -###### Cobalt ###### - - -def test_cobalt_set_run_arg(wlmutils): - orc = Orchestrator( - wlmutils.get_test_port(), - db_nodes=3, - batch=False, - interface="lo", - launcher="cobalt", - run_command="aprun", - ) - orc.set_run_arg("account", "ACCOUNT") - assert all( - [db.run_settings.run_args["account"] == "ACCOUNT" for db in orc.entities] - ) - orc.set_run_arg("pes-per-numa-node", "2") - assert all( - ["pes-per-numa-node" not in db.run_settings.run_args for db in orc.entities] - ) - - -def test_cobalt_set_batch_arg(wlmutils): - orc = Orchestrator( - wlmutils.get_test_port(), - db_nodes=3, - batch=False, - interface="lo", - launcher="cobalt", - run_command="aprun", - ) - with pytest.raises(SmartSimError): - orc.set_batch_arg("account", "ACCOUNT") - - orc2 = Orchestrator( - wlmutils.get_test_port(), - db_nodes=3, - batch=True, - interface="lo", - launcher="cobalt", - run_command="aprun", - ) - orc2.set_batch_arg("account", "ACCOUNT") - assert orc2.batch_settings.batch_args["account"] == "ACCOUNT" - orc2.set_batch_arg("outputprefix", "new_output/") - assert "outputprefix" not in orc2.batch_settings.batch_args - - ###### LSF ###### diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index 2cd725f657..8bc23d14d0 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_pbs_parser.py b/tests/test_pbs_parser.py index 554780cd79..f77eb7c939 100644 --- a/tests/test_pbs_parser.py +++ b/tests/test_pbs_parser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_pbs_settings.py b/tests/test_pbs_settings.py index ed450bd82a..cefe3de4e3 100644 --- a/tests/test_pbs_settings.py +++ b/tests/test_pbs_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index 0faa922423..554e42cbd6 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_run_settings.py b/tests/test_run_settings.py index 7bcd6d8748..b9439f41af 100644 --- a/tests/test_run_settings.py +++ b/tests/test_run_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -97,7 +97,7 @@ def test_create_run_settings_local(): id=f"{l}/orterun", ), ) - for l in ("local", "pbs", "slurm", "lsf", "cobalt") + for l in ("local", "pbs", "slurm", "lsf") ) ), # Except for launchers that implement their own MPI settings diff --git a/tests/test_serialize.py b/tests/test_serialize.py index 167e7e445f..9e92a48668 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ from smartsim._core.utils import serialize from smartsim.database.orchestrator import Orchestrator -_REL_MANIFEST_PATH = f"{serialize.TELMON_SUBDIR}/{serialize.MANIFEST_FILENAME}" _CFG_TM_ENABLED_ATTR = "telemetry_enabled" # The tests in this file belong to the group_b group @@ -54,10 +53,14 @@ def turn_on_tm(monkeypatch): yield -def test_serialize_creates_a_manifest_json_file_if_dne(test_dir): +@pytest.fixture +def manifest_json(test_dir, config) -> str: + return Path(test_dir) / config.telemetry_subdir / serialize.MANIFEST_FILENAME + + +def test_serialize_creates_a_manifest_json_file_if_dne(test_dir, manifest_json): lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") serialize.save_launch_manifest(lmb.finalize()) - manifest_json = Path(test_dir) / _REL_MANIFEST_PATH assert manifest_json.is_file() with open(manifest_json, "r") as f: @@ -69,7 +72,7 @@ def test_serialize_creates_a_manifest_json_file_if_dne(test_dir): def test_serialize_does_not_write_manifest_json_if_telemetry_monitor_is_off( - test_dir, monkeypatch + test_dir, monkeypatch, manifest_json ): monkeypatch.setattr( smartsim._core.config.config.Config, @@ -78,12 +81,10 @@ def test_serialize_does_not_write_manifest_json_if_telemetry_monitor_is_off( ) lmb = 
LaunchedManifestBuilder("exp", test_dir, "launcher") serialize.save_launch_manifest(lmb.finalize()) - manifest_json = Path(test_dir) / _REL_MANIFEST_PATH assert not manifest_json.exists() -def test_serialize_appends_a_manifest_json_exists(test_dir): - manifest_json = Path(test_dir) / _REL_MANIFEST_PATH +def test_serialize_appends_a_manifest_json_exists(test_dir, manifest_json): serialize.save_launch_manifest( LaunchedManifestBuilder("exp", test_dir, "launcher").finalize() ) @@ -102,8 +103,7 @@ def test_serialize_appends_a_manifest_json_exists(test_dir): assert len({run["run_id"] for run in manifest["runs"]}) == 3 -def test_serialize_overwites_file_if_not_json(test_dir): - manifest_json = Path(test_dir) / _REL_MANIFEST_PATH +def test_serialize_overwites_file_if_not_json(test_dir, manifest_json): manifest_json.parent.mkdir(parents=True, exist_ok=True) with open(manifest_json, "w") as f: f.write("This is not a json\n") @@ -114,10 +114,8 @@ def test_serialize_overwites_file_if_not_json(test_dir): assert isinstance(json.load(f), dict) -def test_started_entities_are_serialized(test_dir): +def test_started_entities_are_serialized(test_dir, manifest_json): exp_name = "test-exp" - test_dir = Path(test_dir) / exp_name - test_dir.mkdir(parents=True) exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local") rs1 = exp.create_run_settings("echo", ["hello", "world"]) @@ -131,7 +129,6 @@ def test_started_entities_are_serialized(test_dir): exp.start(hello_world_model, spam_eggs_model, block=False) exp.start(hello_ensemble, block=False) - manifest_json = Path(exp.exp_path) / _REL_MANIFEST_PATH try: with open(manifest_json, "r") as f: manifest = json.load(f) diff --git a/tests/test_shell_util.py b/tests/test_shell_util.py index 7b7ac55b74..24f6b023ca 100644 --- a/tests/test_shell_util.py +++ b/tests/test_shell_util.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All 
rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_slurm_get_alloc.py b/tests/test_slurm_get_alloc.py index 270bbf0140..aa12ce3626 100644 --- a/tests/test_slurm_get_alloc.py +++ b/tests/test_slurm_get_alloc.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -33,7 +33,7 @@ def test_get_alloc_format(): - time = "10:00:00" + time = "10:00:70" nodes = 5 account = "A35311" options = {"ntasks-per-node": 5} @@ -45,7 +45,7 @@ def test_get_alloc_format(): "-J", "SmartSim", "-t", - "10:00:00", + "10:01:10", "-A", "A35311", "--ntasks-per-node=5", diff --git a/tests/test_slurm_parser.py b/tests/test_slurm_parser.py index 30c6c5b31b..b5f7cf32ae 100644 --- a/tests/test_slurm_parser.py +++ b/tests/test_slurm_parser.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_slurm_settings.py b/tests/test_slurm_settings.py index d6bfd50636..aa5b2be115 100644 --- a/tests/test_slurm_settings.py +++ b/tests/test_slurm_settings.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_slurm_validation.py b/tests/test_slurm_validation.py index c3f796ba6c..02baddce6b 100644 --- a/tests/test_slurm_validation.py +++ b/tests/test_slurm_validation.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py index 2f234c2179..282e708cc1 100644 --- a/tests/test_smartredis.py +++ b/tests/test_smartredis.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_step_info.py b/tests/test_step_info.py index eee9201924..ec589ae76a 100644 --- a/tests/test_step_info.py +++ b/tests/test_step_info.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index 3f804b077d..ac3599d7d9 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -93,12 +93,13 @@ def turn_on_tm(monkeypatch): yield -def snooze_nonblocking(test_dir: str, max_delay: int = 20, post_data_delay: int = 2): - telmon_subdir = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR +def snooze_nonblocking( + test_dir: pathlib.Path, max_delay: int = 20, post_data_delay: int = 2 +): # let the non-blocking experiment complete. for _ in range(max_delay): time.sleep(1) - if telmon_subdir.exists(): + if test_dir.exists(): time.sleep(post_data_delay) break @@ -179,7 +180,7 @@ def test_track_event( assert expected_output.is_file() -def test_load_manifest(fileutils: FileUtils, test_dir: str): +def test_load_manifest(fileutils: FileUtils, test_dir: str, config: cfg.Config): """Ensure that the runtime manifest loads correctly""" sample_manifest_path = fileutils.get_test_conf_path("telemetry/telemetry.json") sample_manifest = pathlib.Path(sample_manifest_path) @@ -187,7 +188,7 @@ def test_load_manifest(fileutils: FileUtils, test_dir: str): test_manifest_path = fileutils.make_test_file( serialize.MANIFEST_FILENAME, - pathlib.Path(test_dir) / serialize.TELMON_SUBDIR, + pathlib.Path(test_dir) / config.telemetry_subdir, sample_manifest.read_text(), ) test_manifest = pathlib.Path(test_manifest_path) @@ -431,7 +432,7 @@ def is_alive(self) -> bool: assert observer.stop_count == 1 -def test_telemetry_single_model(fileutils, test_dir, wlmutils): +def test_telemetry_single_model(fileutils, test_dir, wlmutils, config): """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp with unique db_identifiers""" @@ -446,7 +447,7 @@ def test_telemetry_single_model(fileutils, test_dir, wlmutils): exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) 
app_settings.set_tasks_per_node(1) @@ -456,7 +457,7 @@ def test_telemetry_single_model(fileutils, test_dir, wlmutils): exp.start(smartsim_model, block=True) assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -464,7 +465,9 @@ def test_telemetry_single_model(fileutils, test_dir, wlmutils): assert len(stop_events) == 1 -def test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monkeypatch): +def test_telemetry_single_model_nonblocking( + fileutils, test_dir, wlmutils, monkeypatch, config +): """Ensure that the telemetry monitor logs exist when the experiment is non-blocking""" with monkeypatch.context() as ctx: @@ -481,7 +484,7 @@ def test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monke exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -490,11 +493,11 @@ def test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monke exp.generate(smartsim_model) exp.start(smartsim_model) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -502,7 +505,7 @@ def 
test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monke assert len(stop_events) == 1 -def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch): +def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch, config): """ Test telemetry with models being run in serial (one after each other) """ @@ -520,7 +523,7 @@ def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch): exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -534,7 +537,7 @@ def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch): [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] ) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -543,7 +546,7 @@ def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch): def test_telemetry_serial_models_nonblocking( - fileutils, test_dir, wlmutils, monkeypatch + fileutils, test_dir, wlmutils, monkeypatch, config ): """ Test telemetry with models being run in serial (one after each other) @@ -563,7 +566,7 @@ def test_telemetry_serial_models_nonblocking( exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -574,13 +577,13 @@ def test_telemetry_serial_models_nonblocking( exp.generate(*smartsim_models) exp.start(*smartsim_models) - 
snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) assert all( [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] ) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -588,7 +591,7 @@ def test_telemetry_serial_models_nonblocking( assert len(stop_events) == 5 -def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): +def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config): """ Test telemetry with only a database running """ @@ -609,12 +612,14 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): # create regular database orc = exp.create_database(port=test_port, interface=test_interface) exp.generate(orc) + + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + try: exp.start(orc, block=True) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -622,7 +627,7 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): assert len(stop_events) <= 1 finally: exp.stop(orc) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) assert exp.get_status(orc)[0] == STATUS_CANCELLED @@ -630,7 +635,7 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): assert len(stop_events) == 1 -def test_telemetry_db_only_without_generate(test_dir, 
wlmutils, monkeypatch): +def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, config): """ Test telemetry with only a non-generated database running """ @@ -651,13 +656,13 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch): # create regular database orc = exp.create_database(port=test_port, interface=test_interface) orc.set_path(test_dir) + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir try: exp.start(orc) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -666,14 +671,14 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch): finally: exp.stop(orc) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) assert exp.get_status(orc)[0] == STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 -def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): +def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch, config): """ Test telemetry with only a database and a model running """ @@ -700,7 +705,7 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): exp.start(orc) # create run settings - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -711,13 +716,12 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): finally: exp.stop(orc) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + 
telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) assert exp.get_status(orc)[0] == STATUS_CANCELLED assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR - start_events = list(telemetry_output_path.rglob("database/**/start.json")) stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) @@ -730,7 +734,7 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): assert len(stop_events) == 1 -def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): +def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch, config): """ Test telemetry with only an ensemble """ @@ -748,7 +752,7 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - app_settings = exp.create_run_settings("python", test_script) + app_settings = exp.create_run_settings(sys.executable, test_script) app_settings.set_nodes(1) app_settings.set_tasks_per_node(1) @@ -757,8 +761,8 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): exp.start(ens, block=True) assert all([status == STATUS_COMPLETED for status in exp.get_status(ens)]) - snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir + snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -766,7 +770,7 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): assert len(stop_events) == 5 -def test_telemetry_colo(fileutils, test_dir, wlmutils, 
coloutils, monkeypatch): +def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch, config): """ Test telemetry with only a colocated model running """ @@ -797,7 +801,7 @@ def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch): [status == STATUS_COMPLETED for status in exp.get_status(smartsim_model)] ) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -814,7 +818,9 @@ def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch): pytest.param(1, 15, id="15s shutdown"), ], ) -def test_telemetry_autoshutdown(test_dir, wlmutils, monkeypatch, frequency, cooldown): +def test_telemetry_autoshutdown( + test_dir, wlmutils, monkeypatch, frequency, cooldown, config +): """ Ensure that the telemetry monitor process shuts down after the desired cooldown period @@ -837,7 +843,7 @@ def test_telemetry_autoshutdown(test_dir, wlmutils, monkeypatch, frequency, cool stop_time = start_time exp.start(block=False) - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir empty_mani = list(telemetry_output_path.rglob("manifest.json")) assert len(empty_mani) == 1, "an manifest.json should be created" @@ -867,8 +873,8 @@ def get_launch_cmd(self): @pytest.fixture -def mock_step_meta_dict(test_dir): - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR +def mock_step_meta_dict(test_dir, config): + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir yield { "entity_type": "mock", "status_dir": telemetry_output_path, @@ -958,6 +964,7 @@ def test_multistart_experiment( test_dir: str, monkeypatch: pytest.MonkeyPatch, run_command: str, + config: cfg.Config, ): """Run an experiment 
with multiple start calls to ensure that telemetry is saved correctly for each run @@ -1016,7 +1023,7 @@ def test_multistart_experiment( assert tm_pid == exp._control._telemetry_monitor.pid time.sleep(3) # time for telmon to write db stop event - telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir db_start_events = list(telemetry_output_path.rglob("database/**/start.json")) db_stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) diff --git a/tutorials/getting_started/getting_started.ipynb b/tutorials/getting_started/getting_started.ipynb index a804435640..0a5230b0f5 100644 --- a/tutorials/getting_started/getting_started.ipynb +++ b/tutorials/getting_started/getting_started.ipynb @@ -36,7 +36,6 @@ "The `Experiment` also needs to have a `launcher` specified. Launchers provide SmartSim the ability to construct and execute complex workloads on HPC systems with schedulers (workload managers) like Slurm, or PBS. 
SmartSim currently supports\n", " * `slurm`\n", " * `pbs`\n", - " * `cobalt`\n", " * `lsf`\n", " * `local` (single node/laptops)\n", " * `auto`\n", @@ -809,7 +808,7 @@ "module = torch.jit.trace(net, example_forward_input)\n", "\n", "# Save the traced model to a file\n", - "torch.jit.save(module, \"./torch_cnn.pt\") " + "torch.jit.save(module, \"./torch_cnn.pt\")" ] }, { @@ -982,7 +981,7 @@ "source": [ "rs_prod = exp.create_run_settings(\"python\", f\"producer.py --redis-port {REDIS_PORT}\")\n", "ensemble = exp.create_ensemble(name=\"producer\",\n", - " replicas=2, \n", + " replicas=2,\n", " run_settings=rs_prod)" ] }, diff --git a/tutorials/ml_inference/Inference-in-SmartSim.ipynb b/tutorials/ml_inference/Inference-in-SmartSim.ipynb index 384c46d698..711ae999c9 100644 --- a/tutorials/ml_inference/Inference-in-SmartSim.ipynb +++ b/tutorials/ml_inference/Inference-in-SmartSim.ipynb @@ -74,7 +74,7 @@ "\n", "Build SmartSim dependencies (Redis, RedisAI, ML runtimes)\n", "\n", - "optional arguments:\n", + "options:\n", " -h, --help show this help message and exit\n", " -v Enable verbose build process\n", " --device {cpu,gpu} Device to build ML runtimes for\n", @@ -129,16 +129,16 @@ "\n", "ML Backends Requested\n", "╒════════════╤════════╤══════╕\n", - "│ PyTorch │ 1.11.0 │ \u001b[32mTrue\u001b[0m │\n", - "│ TensorFlow │ 2.8.0 │ \u001b[32mTrue\u001b[0m │\n", - "│ ONNX │ 1.11.0 │ \u001b[32mTrue\u001b[0m │\n", + "│ PyTorch │ 2.0.1 │ \u001b[32mTrue\u001b[0m │\n", + "│ TensorFlow │ 2.13.1 │ \u001b[32mTrue\u001b[0m │\n", + "│ ONNX │ 1.14.1 │ \u001b[32mTrue\u001b[0m │\n", "╘════════════╧════════╧══════╛\n", "\n", "Building for GPU support: \u001b[31mFalse\u001b[0m\n", "\n", "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m Building RedisAI version 1.2.7 from https://github.com/RedisAI/RedisAI.git/\n", "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m ML Backends and RedisAI build complete!\n", - "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m Tensorflow, 
Torch, Onnxruntime backend(s) built\n", + "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m Tensorflow, Onnxruntime, Torch backend(s) built\n", "\u001b[34m[SmartSim]\u001b[0m \u001b[1;30mINFO\u001b[0m SmartSim build complete!\n" ] } @@ -351,48 +351,46 @@ "name": "stdout", "output_type": "stream", "text": [ - "SmartRedis Library@23-56-41:WARNING: Environment variable SR_LOG_FILE is not set. Defaulting to stdout\n", - "SmartRedis Library@23-56-41:WARNING: Environment variable SR_LOG_LEVEL is not set. Defaulting to INFO\n", - "Prediction: [[-2.3274555 -2.3253717 -2.354757 -2.3729622 -2.3431003 -2.1907542\n", - " -2.3514638 -2.1824958 -2.3210742 -2.2772176]\n", - " [-2.319342 -2.3146112 -2.370425 -2.372699 -2.3437245 -2.1988375\n", - " -2.354674 -2.1797025 -2.3205185 -2.2724082]\n", - " [-2.316474 -2.3222082 -2.354598 -2.3659394 -2.3442194 -2.203955\n", - " -2.3561926 -2.1938426 -2.3158035 -2.2702417]\n", - " [-2.3319743 -2.311106 -2.356003 -2.3770962 -2.333499 -2.1953351\n", - " -2.3548756 -2.195049 -2.310809 -2.2787712]\n", - " [-2.3205962 -2.3178282 -2.3519592 -2.3816493 -2.3516834 -2.1981795\n", - " -2.3636622 -2.1777525 -2.3139138 -2.2705152]\n", - " [-2.3096914 -2.3222034 -2.3647196 -2.3790689 -2.3540542 -2.206103\n", - " -2.350227 -2.1878397 -2.3078933 -2.2638521]\n", - " [-2.3328648 -2.3219166 -2.3527567 -2.3824098 -2.3419397 -2.1949291\n", - " -2.3534136 -2.1831408 -2.31838 -2.2653728]\n", - " [-2.3125417 -2.324307 -2.3541815 -2.379772 -2.348383 -2.2018006\n", - " -2.3614779 -2.1773078 -2.322288 -2.2653532]\n", - " [-2.3261974 -2.3169107 -2.3658333 -2.372918 -2.3417373 -2.1894612\n", - " -2.3535395 -2.2018242 -2.308719 -2.268019 ]\n", - " [-2.316616 -2.3056076 -2.355318 -2.3717446 -2.346278 -2.1928883\n", - " -2.3632033 -2.2028553 -2.3090112 -2.2805274]\n", - " [-2.3209507 -2.3127859 -2.358682 -2.3774037 -2.3558414 -2.2000623\n", - " -2.3439143 -2.1920927 -2.3196788 -2.2638488]\n", - " [-2.3159695 -2.3109243 -2.356306 -2.374135 -2.3412004 -2.1999855\n", 
- " -2.3728766 -2.1851294 -2.3103416 -2.2791054]\n", - " [-2.320004 -2.3205712 -2.3569424 -2.3752837 -2.3463457 -2.1887283\n", - " -2.3645942 -2.1946917 -2.3067377 -2.272361 ]\n", - " [-2.310819 -2.3274822 -2.356091 -2.3715394 -2.3474889 -2.200722\n", - " -2.3434677 -2.1957805 -2.3201551 -2.2701602]\n", - " [-2.3143158 -2.31956 -2.358585 -2.362682 -2.3464782 -2.196579\n", - " -2.3578608 -2.2015376 -2.3066673 -2.2789493]\n", - " [-2.318907 -2.3225117 -2.3634868 -2.3806338 -2.344084 -2.1920872\n", - " -2.3534818 -2.1955805 -2.3039575 -2.2711294]\n", - " [-2.3084583 -2.3254113 -2.3642344 -2.3710778 -2.3496058 -2.192245\n", - " -2.3604536 -2.1796546 -2.310007 -2.286219 ]\n", - " [-2.3140576 -2.3124697 -2.359347 -2.379842 -2.3481016 -2.1948602\n", - " -2.3681424 -2.1851056 -2.3161757 -2.2693238]\n", - " [-2.3162746 -2.3137376 -2.3598473 -2.3751001 -2.3536685 -2.1899457\n", - " -2.3560162 -2.1918488 -2.3077402 -2.2818694]\n", - " [-2.3138344 -2.3119657 -2.3552136 -2.3767023 -2.3556495 -2.187487\n", - " -2.3484402 -2.1922355 -2.3236399 -2.2809098]]\n" + "Prediction: [[-2.1860428 -2.3318565 -2.2773128 -2.2742267 -2.2679536 -2.304159\n", + " -2.423439 -2.3406057 -2.2474668 -2.3950338]\n", + " [-2.1803837 -2.3286302 -2.2805855 -2.2874444 -2.261593 -2.3145547\n", + " -2.4357762 -2.3169715 -2.2618299 -2.3798223]\n", + " [-2.1833746 -2.3249795 -2.28497 -2.2851245 -2.2555952 -2.308204\n", + " -2.4274755 -2.3441646 -2.2553194 -2.3779805]\n", + " [-2.1843016 -2.3395848 -2.2619352 -2.294549 -2.2571433 -2.312943\n", + " -2.4161577 -2.338785 -2.2538524 -2.3881512]\n", + " [-2.1936755 -2.3315516 -2.2739122 -2.2832148 -2.2666094 -2.3038912\n", + " -2.4211216 -2.3300066 -2.2564852 -2.3846986]\n", + " [-2.1709712 -2.3271346 -2.280365 -2.286064 -2.2617233 -2.3227994\n", + " -2.4253702 -2.3313646 -2.2593162 -2.383301 ]\n", + " [-2.1948013 -2.3318067 -2.2713811 -2.2844 -2.2526758 -2.3178148\n", + " -2.4255004 -2.3233378 -2.2388031 -2.4088087]\n", + " [-2.17515 -2.3240736 -2.2818787 
-2.2857373 -2.259629 -2.3184\n", + " -2.425821 -2.3519678 -2.2413275 -2.385761 ]\n", + " [-2.187554 -2.3335872 -2.2767708 -2.2818003 -2.2654893 -2.3097534\n", + " -2.4182632 -2.3376188 -2.2509694 -2.384327 ]\n", + " [-2.1793714 -2.340681 -2.271785 -2.287751 -2.2620957 -2.3163543\n", + " -2.4111845 -2.3468175 -2.2472064 -2.3842056]\n", + " [-2.1906679 -2.3483853 -2.2580595 -2.2923894 -2.25718 -2.2951608\n", + " -2.431815 -2.3487022 -2.2326546 -2.3963163]\n", + " [-2.1882055 -2.3293467 -2.2767649 -2.279892 -2.2527165 -2.3220086\n", + " -2.4226239 -2.3364902 -2.2455037 -2.394776 ]\n", + " [-2.1756573 -2.3318045 -2.2690601 -2.2737868 -2.264148 -2.3212118\n", + " -2.4243867 -2.3421402 -2.2562728 -2.390894 ]\n", + " [-2.1824148 -2.3317673 -2.2749603 -2.291667 -2.2524009 -2.3026595\n", + " -2.42986 -2.3290846 -2.265264 -2.387787 ]\n", + " [-2.1871543 -2.3408008 -2.2773213 -2.283908 -2.249834 -2.3159058\n", + " -2.4251873 -2.339211 -2.245001 -2.3839695]\n", + " [-2.1855574 -2.3216138 -2.2722392 -2.2826352 -2.2573392 -2.308948\n", + " -2.4348576 -2.3421624 -2.2397952 -2.4060655]\n", + " [-2.1876159 -2.330091 -2.2779942 -2.2849102 -2.2582757 -2.3122754\n", + " -2.4250498 -2.333003 -2.250753 -2.3871331]\n", + " [-2.182653 -2.3381891 -2.2795184 -2.287199 -2.2628696 -2.303869\n", + " -2.413879 -2.3404965 -2.26254 -2.3739154]\n", + " [-2.1733668 -2.3377435 -2.2724369 -2.28559 -2.2537165 -2.3127556\n", + " -2.4249415 -2.3484716 -2.2515364 -2.3897333]\n", + " [-2.1839535 -2.336417 -2.2839231 -2.285238 -2.2608624 -2.3198016\n", + " -2.424396 -2.3165755 -2.2433887 -2.3935702]]\n" ] } ], @@ -420,8 +418,8 @@ "source": [ "As we gave the CNN random noise, the predictions reflect that.\n", "\n", - "If running on CPU, be sure to change the argument in the ``set_model`` call\n", - "above to ``CPU``." + "If running on GPU, be sure to change the argument in the ``set_model`` call\n", + "above to ``device=\"GPU\"``." 
] }, { @@ -468,46 +466,46 @@ "name": "stdout", "output_type": "stream", "text": [ - "U: [[[-0.550159 0.8065786 ]\n", - " [-0.52288723 -0.5346357 ]\n", - " [-0.6510868 -0.2521817 ]]\n", + "U: [[[-0.31189808 0.86989427]\n", + " [-0.48122275 -0.49140105]\n", + " [-0.81923395 -0.0425336 ]]\n", "\n", - " [[-0.17983183 -0.20003092]\n", - " [-0.5534476 -0.7888692 ]\n", - " [-0.81323797 0.58109635]]\n", + " [[-0.5889101 -0.29554686]\n", + " [-0.43949458 -0.66398275]\n", + " [-0.6782547 0.68686163]]\n", "\n", - " [[-0.20800859 0.42269117]\n", - " [-0.65485084 -0.7300564 ]\n", - " [-0.7265692 0.53698224]]\n", + " [[-0.61623317 0.05853765]\n", + " [-0.6667615 -0.5695148 ]\n", + " [-0.4191489 0.81989413]]\n", "\n", - " [[-0.336111 0.77894354]\n", - " [-0.31149226 0.43854192]\n", - " [-0.8888205 -0.44825 ]]\n", + " [[-0.5424681 0.8400398 ]\n", + " [-0.31990844 -0.2152339 ]\n", + " [-0.77678 -0.49800384]]\n", "\n", - " [[-0.6365824 0.7635661 ]\n", - " [-0.2663487 -0.08588188]\n", - " [-0.723755 -0.639993 ]]]\n", + " [[-0.43667376 0.8088193 ]\n", + " [-0.70812154 -0.57906115]\n", + " [-0.5548693 0.10246649]]]\n", "\n", - ", S: [[137.34267 54.616768]\n", - " [142.89323 35.937744]\n", - " [ 90.98083 48.821 ]\n", - " [ 86.74378 31.835794]\n", - " [146.14839 36.327038]]\n", + ", S: [[137.10924 25.710997]\n", + " [131.49983 37.79937 ]\n", + " [178.72423 24.792084]\n", + " [125.13014 49.733784]\n", + " [137.48834 53.57199 ]]\n", "\n", - ", V: [[[-0.48165366 0.8763617 ]\n", - " [-0.8763617 -0.48165366]]\n", + ", V: [[[-0.8333395 0.5527615 ]\n", + " [-0.5527615 -0.8333395 ]]\n", "\n", - " [[-0.47905296 0.8777859 ]\n", - " [-0.8777859 -0.47905296]]\n", + " [[-0.5085228 -0.8610485 ]\n", + " [-0.8610485 0.5085228 ]]\n", "\n", - " [[-0.737007 -0.67588514]\n", - " [-0.67588514 0.737007 ]]\n", + " [[-0.8650402 0.5017025 ]\n", + " [-0.5017025 -0.8650402 ]]\n", "\n", - " [[-0.28137407 0.9595981 ]\n", - " [-0.9595981 -0.28137407]]\n", + " [[-0.56953645 0.8219661 ]\n", + " [-0.8219661 
-0.56953645]]\n", "\n", - " [[-0.5767642 -0.8169106 ]\n", - " [-0.8169106 0.5767642 ]]]\n", + " [[-0.6115895 0.79117525]\n", + " [-0.79117525 -0.6115895 ]]]\n", "\n" ] } @@ -594,8 +592,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[[0.03525018 0.04472604 0.02831913 0.1114466 0.25944078 0.11165252\n", - " 0.2983908 0.04830809 0.02390536 0.03856055]]\n" + "[[0.05032112 0.06484107 0.03512685 0.14747524 0.14440396 0.02395445\n", + " 0.03395916 0.06222691 0.26738793 0.1703033 ]]\n" ] } ], @@ -657,8 +655,6 @@ "\n", "And PyTorch has its own converter.\n", "\n", - "Currently the ONNX backend only works on Linux, but MacOS support will be added in the future.\n", - "\n", "Below are some examples of a few models in [Scikit-learn](https://scikit-learn.org)\n", "that are converted into ONNX format for use with SmartSim. To use ONNX in SmartSim, specify\n", "`ONNX` as the argument for *backend* in the call to `client.set_model` or\n", @@ -801,15 +797,7 @@ "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23:56:50 C02G13RYMD6N SmartSim[33744] INFO Stopping model orchestrator_0 with job name orchestrator_0-CVIG02IVGHO0\n" - ] - } - ], + "outputs": [], "source": [ "exp.stop(db)" ] @@ -830,12 +818,12 @@ " Name Entity-Type JobID RunID Time Status Returncode \n", "\n", "\n", - "0 orchestrator_0DBNode 35628 0 29.7008Cancelled-9 \n", + "0 orchestrator_0DBNode 31857 0 32.7161Cancelled0 \n", "\n", "" ], "text/plain": [ - "'\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID Time Status Returncode
0 orchestrator_0DBNode 35628 0 29.7008Cancelled-9
'" + "'\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID Time Status Returncode
0 orchestrator_0DBNode 31857 0 32.7161Cancelled0
'" ] }, "execution_count": 19, @@ -844,7 +832,7 @@ } ], "source": [ - "exp.summary(format=\"html\")" + "exp.summary(style=\"html\")" ] }, { @@ -901,24 +889,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "23:56:50 C02G13RYMD6N SmartSim[33744] INFO \n", + "21:18:06 C02G13RYMD6N SmartSim[30945] INFO \n", "\n", "=== Launch Summary ===\n", "Experiment: Inference-Tutorial\n", - "Experiment Path: /Users/mrdro/repos/ssimdev/ss/tutorials/ml_inference/Inference-Tutorial\n", + "Experiment Path: /Users/smartsim/smartsim/tutorials/ml_inference/Inference-Tutorial\n", "Launcher: local\n", "Models: 1\n", "Database Status: inactive\n", "\n", "=== Models ===\n", "colocated_model\n", - "Executable: /Users/mrdro/miniconda3/envs/smartsim/bin/python\n", + "Executable: /Users/smartsim/venv/bin/python\n", "Executable Arguments: ./colo-db-torch-example.py\n", "Co-located Database: True\n", "\n", "\n", "\n", - "23:56:52 C02G13RYMD6N SmartSim[33744] INFO colocated_model(35666): Completed\n" + "21:18:09 C02G13RYMD6N SmartSim[30945] INFO colocated_model(31865): Completed\n" ] } ], @@ -942,13 +930,13 @@ " Name Entity-Type JobID RunID Time Status Returncode \n", "\n", "\n", - "0 orchestrator_0 DBNode 35628 0 29.7008Cancelled-9 \n", - "1 colocated_modelModel 35666 0 2.1590 Completed0 \n", + "0 orchestrator_0 DBNode 31857 0 32.7161Cancelled0 \n", + "1 colocated_modelModel 31865 0 3.5862 Completed0 \n", "\n", "" ], "text/plain": [ - "'\\n\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID Time Status Returncode
0 orchestrator_0 DBNode 35628 0 29.7008Cancelled-9
1 colocated_modelModel 35666 0 2.1590 Completed0
'" + "'\\n\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID Time Status Returncode
0 orchestrator_0 DBNode 31857 0 32.7161Cancelled0
1 colocated_modelModel 31865 0 3.5862 Completed0
'" ] }, "execution_count": 22, @@ -957,7 +945,7 @@ } ], "source": [ - "exp.summary(format=\"html\")" + "exp.summary(style=\"html\")" ] } ], @@ -977,7 +965,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/tutorials/online_analysis/lattice/online_analysis.ipynb b/tutorials/online_analysis/lattice/online_analysis.ipynb index 48ddf6032a..3389b11905 100644 --- a/tutorials/online_analysis/lattice/online_analysis.ipynb +++ b/tutorials/online_analysis/lattice/online_analysis.ipynb @@ -345,7 +345,7 @@ "source": [ "## Post-processing with TorchScript\n", "\n", - "We can upload [TorchScript functions](https://pytorch.org/docs/1.11/jit.html) to the DB. Tensors which are stored on the DB can be passed as arguments to uploaded functions and the results will be stored on the DB. This makes it possible to perform pre- and post-processing operations on tensors localli, *in the DB*, reducing the number of data transfers.\n", + "We can upload [TorchScript functions](https://pytorch.org/docs/2.0/jit.html) to the DB. Tensors which are stored on the DB can be passed as arguments to uploaded functions and the results will be stored on the DB. This makes it possible to perform pre- and post-processing operations on tensors localli, *in the DB*, reducing the number of data transfers.\n", "\n", "### Uploading a script\n", "We can load a file containing TorchScript-compatible functionsto the DB. For example, the file `./probe.script` contains the function `probe_points` which interpolates the values of `ux` and `uy` at some user-provided probe points. This is useful when we are interested in the value of a given fields only at specific locations.\n",