diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 6c1361b46c..e57994633d 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -32,108 +32,61 @@ on:
release:
types: [published]
-
-env:
- HOMEBREW_NO_ANALYTICS: "ON" # Make Homebrew installation a little quicker
- HOMEBREW_NO_AUTO_UPDATE: "ON"
- HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON"
- HOMEBREW_NO_GITHUB_API: "ON"
- HOMEBREW_NO_INSTALL_CLEANUP: "ON"
- CIBW_SKIP: "pp* *i686*" # skip building for PyPy
- CIBW_ARCHS_MACOS: x86_64
- CIBW_ARCHS_LINUX: x86_64 # ppc64le # uncomment to enable powerPC build
- CIBW_ENVIRONMENT_MACOS: PATH="$(brew --prefix)/opt/make/libexec/gnubin:$PATH"
- MACOSX_DEPLOYMENT_TARGET: "10.09"
-
-
jobs:
- build_wheels:
- name: Build wheels on ${{ matrix.os }}
- runs-on: ${{ matrix.os }}
- strategy:
- fail-fast: false
- matrix:
- os: [ubuntu-22.04, macos-12]
-
+ build_dists:
+ name: Build Distributions
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
+ with:
+ python-version: '3.9'
- - name: Install cibuildwheel
- run: python -m pip install cibuildwheel>=2.12.3
+ - name: Install build
+ run: python -m pip install 'build>=1.2.2,<2'
- name: Install build-essentials
- if: contains(matrix.os, 'ubuntu')
run: |
sudo add-apt-repository ppa:ubuntu-toolchain-r/test
sudo apt-get update
- sudo apt-get install -y build-essential
- sudo apt-get install -y wget
+ sudo apt-get install -y build-essential wget
- - name: Install GNU make for MacOS
- if: contains(matrix.os, 'macos')
- run: brew install make || true
+ - name: Build Distributions
+ run: python -m build .
- - name: list target wheels
- run: |
- python -m cibuildwheel . --print-build-identifiers
-
- - name: Build wheels
- run: python -m cibuildwheel --output-dir wheelhouse
- env:
- CIBW_ENVIRONMENT_MACOS: PATH="$(brew --prefix)/opt/make/libexec/gnubin:$PATH"
- MACOSX_DEPLOYMENT_TARGET: "10.09"
-
- - uses: actions/upload-artifact@v2
+ - uses: actions/upload-artifact@v3
with:
- path: ./wheelhouse/*.whl
-
-
- build_sdist:
- name: Build source distribution
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
-
- - uses: actions/setup-python@v5
- name: Install Python
- with:
- python-version: '3.9'
-
- - name: Build sdist
- run: |
- python -m pip install cmake>=3.13
- python setup.py sdist
-
- - uses: actions/upload-artifact@v2
- with:
- path: dist/*.tar.gz
+ name: distributables
+ path: ./dist/*
upload_pypi:
- needs: [build_wheels, build_sdist]
- runs-on: ubuntu-latest
+ needs: [build_dists]
+ runs-on: ubuntu-22.04
steps:
- - uses: actions/download-artifact@v2
+ - uses: actions/download-artifact@v3
with:
- name: artifact
+ name: distributables
path: dist
- uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.PYPI }}
- #repository_url: https://test.pypi.org/legacy/
-
+ # repository-url: https://test.pypi.org/legacy/
createPullRequest:
- runs-on: ubuntu-latest
+ needs: [upload_pypi]
+ runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Create pull request
run: |
- gh pr create -B develop -H master --title 'Merge master into develop' --body 'This PR brings develop up to date with master for release.'
+ gh pr create -B develop \
+ -H master \
+ --title 'Merge master into develop' \
+ --body 'This PR brings develop up to date with master for release.'
env:
GH_TOKEN: ${{ github.token }}
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 04d7e25ae1..ca6ec4cc40 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -49,12 +49,12 @@ env:
jobs:
run_tests:
- name: Run tests ${{ matrix.subset }} with ${{ matrix.os }}, Python ${{ matrix.py_v}}, RedisAI ${{ matrix.rai }}
+ name: Run tests ${{ matrix.subset }} with ${{ matrix.os }}, Python ${{ matrix.py_v}}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
- subset: [backends, slow_tests, group_a, group_b]
+ subset: [backends, slow_tests, group_a, group_b, dragon]
os: [macos-12, macos-14, ubuntu-22.04] # Operating systems
compiler: [8] # GNU compiler version
rai: [1.2.7] # Redis AI versions
@@ -63,9 +63,6 @@ jobs:
- os: macos-14
py_v: "3.9"
- env:
- SMARTSIM_REDISAI: ${{ matrix.rai }}
-
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
@@ -117,15 +114,26 @@ jobs:
- name: Install SmartSim (with ML backends)
run: |
python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis
- python -m pip install .[dev,mypy,ml]
+ python -m pip install .[dev,mypy]
- - name: Install ML Runtimes with Smart (with pt, tf, and onnx support)
- if: contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')
- run: smart build --device cpu --onnx -v
+ - name: Install ML Runtimes
+ if: matrix.subset != 'dragon'
+ run: smart build --device cpu -v
- - name: Install ML Runtimes with Smart (no ONNX,TF on Apple Silicon)
- if: contains( matrix.os, 'macos-14' )
- run: smart build --device cpu --no_tf -v
+
+ - name: Install ML Runtimes (with dragon)
+ if: matrix.subset == 'dragon'
+ env:
+ SMARTSIM_DRAGON_TOKEN: ${{ secrets.DRAGON_TOKEN }}
+ run: |
+ if [ -n "${SMARTSIM_DRAGON_TOKEN}" ]; then
+ smart build --device cpu -v --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.10
+ else
+ smart build --device cpu -v --dragon
+ fi
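+        # Add the LD_LIBRARY_PATH entries from the dragon .env config file to the environment for later steps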
+ SP=$(python -c 'import site; print(site.getsitepackages()[0])')/smartsim/_core/config/dragon/.env
+ LLP=$(cat $SP | grep LD_LIBRARY_PATH | awk '{split($0, array, "="); print array[2]}')
+ echo "LD_LIBRARY_PATH=$LLP:$LD_LIBRARY_PATH" >> $GITHUB_ENV
- name: Run mypy
run: |
@@ -151,9 +159,16 @@ jobs:
echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV
py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ ./tests/backends
+ # Run pytest (dragon subtests)
+ - name: Run Dragon Pytest
+ if: (matrix.subset == 'dragon' && matrix.os == 'ubuntu-22.04')
+ run: |
+ echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV
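+        # Launch pytest through the dragon runtime so the dragon-marked tests execute inside a Dragon process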
+ dragon -s py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ -m ${{ matrix.subset }} ./tests
+
# Run pytest (test subsets)
- name: Run Pytest
- if: "!contains(matrix.subset, 'backends')" # if not running backend tests
+ if: (matrix.subset != 'backends' && matrix.subset != 'dragon') # if not running backend tests or dragon tests
run: |
echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV
py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ -m ${{ matrix.subset }} ./tests
diff --git a/.gitignore b/.gitignore
index 77b91d5865..97132aff7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ tests/test_output
# Dependencies
smartsim/_core/.third-party
smartsim/_core/.dragon
+smartsim/_core/build
# Docs
_build
diff --git a/.wci.yml b/.wci.yml
index 6194f19391..cf53334c3a 100644
--- a/.wci.yml
+++ b/.wci.yml
@@ -22,8 +22,8 @@
language: Python
release:
- version: 0.7.0
- date: 2024-05-14
+ version: 0.8.0
+ date: 2024-09-25
documentation:
general: https://www.craylabs.org/docs/overview.html
diff --git a/Makefile b/Makefile
index bddbda722b..b4ceef2194 100644
--- a/Makefile
+++ b/Makefile
@@ -150,11 +150,11 @@ tutorials-dev:
@docker compose build tutorials-dev
@docker run -p 8888:8888 smartsim-tutorials:dev-latest
-# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.7.0)
+# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.8.0)
.PHONY: tutorials-prod
tutorials-prod:
@docker compose build tutorials-prod
- @docker run -p 8888:8888 smartsim-tutorials:v0.7.0
+ @docker run -p 8888:8888 smartsim-tutorials:v0.8.0
# help:
@@ -164,22 +164,22 @@ tutorials-prod:
# help: test - Run all tests
.PHONY: test
test:
- @python -m pytest --ignore=tests/full_wlm/
+ @python -m pytest --ignore=tests/full_wlm/ --ignore=tests/dragon_wlm
# help: test-verbose - Run all tests verbosely
.PHONY: test-verbose
test-verbose:
- @python -m pytest -vv --ignore=tests/full_wlm/
+ @python -m pytest -vv --ignore=tests/full_wlm/ --ignore=tests/dragon_wlm
# help: test-debug - Run all tests with debug output
.PHONY: test-debug
test-debug:
- @SMARTSIM_LOG_LEVEL=developer python -m pytest -s -o log_cli=true -vv --ignore=tests/full_wlm/
+ @SMARTSIM_LOG_LEVEL=developer python -m pytest -s -o log_cli=true -vv --ignore=tests/full_wlm/ --ignore=tests/dragon_wlm
# help: test-cov - Run all tests with coverage
.PHONY: test-cov
test-cov:
- @python -m pytest -vv --cov=./smartsim --cov-config=${COV_FILE} --ignore=tests/full_wlm/
+ @python -m pytest -vv --cov=./smartsim --cov-config=${COV_FILE} --ignore=tests/full_wlm/ --ignore=tests/dragon_wlm
# help: test-full - Run all WLM tests with Python coverage (full test suite)
@@ -192,3 +192,8 @@ test-full:
.PHONY: test-wlm
test-wlm:
@python -m pytest -vv tests/full_wlm/ tests/on_wlm
+
+# help: test-dragon - Run dragon-specific tests
+.PHONY: test-dragon
+test-dragon:
+ @dragon pytest tests/dragon_wlm
diff --git a/README.md b/README.md
index c0986042eb..610d6608c0 100644
--- a/README.md
+++ b/README.md
@@ -643,11 +643,11 @@ from C, C++, Fortran and Python with the SmartRedis Clients:
1.2.7 |
PyTorch |
- 2.0.1 |
+ 2.1.0 |
TensorFlow\Keras |
- 2.13.1 |
+ 2.15.0 |
ONNX |
diff --git a/doc/_static/version_names.json b/doc/_static/version_names.json
index bc095f84af..8b127e5867 100644
--- a/doc/_static/version_names.json
+++ b/doc/_static/version_names.json
@@ -1,7 +1,8 @@
{
"version_names":[
"develop (unstable)",
- "0.7.0 (stable)",
+ "0.8.0 (stable)",
+ "0.7.0",
"0.6.2",
"0.6.1",
"0.6.0",
@@ -15,6 +16,7 @@
"version_urls": [
"https://www.craylabs.org/develop/overview.html",
"https://www.craylabs.org/docs/overview.html",
+ "https://www.craylabs.org/docs/versions/0.7.0/overview.html",
"https://www.craylabs.org/docs/versions/0.6.2/overview.html",
"https://www.craylabs.org/docs/versions/0.6.1/overview.html",
"https://www.craylabs.org/docs/versions/0.6.0/overview.html",
diff --git a/doc/changelog.md b/doc/changelog.md
index 740197ce5d..752957bfdc 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -9,12 +9,80 @@ Jump to:
## SmartSim
-### Development branch
+### MLI branch
-To be released at some future point in time
+Description
+
+- Merge core refactor into MLI feature branch
+- Implement asynchronous notifications for shared data
+- Quick bug fix in _validate
+- Add helper methods to MLI classes
+- Update error handling for consistency
+- Parameterize installation of dragon package with `smart build`
+- Update docstrings
+- Filenames conform to snake case
+- Update SmartSim environment variables using new naming convention
+- Refactor `exception_handler`
+- Add RequestDispatcher and the possibility of batching inference requests
+- Enable hostname selection for dragon tasks
+- Remove pydantic dependency from MLI code
+- Update MLI environment variables using new naming convention
+- Reduce a copy by using torch.from_numpy instead of torch.tensor
+- Enable dynamic feature store selection
+- Fix dragon package installation bug
+- Adjust schemas for better performance
+- Add TorchWorker first implementation and mock inference app example
+- Add error handling in Worker Manager pipeline
+- Add EnvironmentConfigLoader for ML Worker Manager
+- Add Model schema with model metadata included
+- Remove device from schemas, MessageHandler, and tests
+- Add ML worker manager, sample worker, and feature store
+- Add schemas and MessageHandler class for de/serialization of
+ inference requests and response messages
+
+
+### Develop
+
+To be released at some point in the future
+
+Description
+
+- Implement workaround for Tensorflow that allows RedisAI to build with GCC-14
+- Add instructions for installing SmartSim on PML's Scylla
+
+Detailed Notes
+
+- In libtensorflow, the input argument to TF_SessionRun seems to be mistyped to
+ TF_Output instead of TF_Input. These two types differ only in name. GCC-14
+ catches this and throws an error, even though earlier versions allow this. To
+ solve this problem, patches are applied to the Tensorflow backend in RedisAI.
+ Future versions of Tensorflow may fix this problem, but for now this seems to be
+ the best workaround.
+ ([SmartSim-PR738](https://github.com/CrayLabs/SmartSim/pull/738))
+- PML's Scylla is still under development. The usual SmartSim
+ build instructions do not apply because the GPU dependencies
+ have yet to be installed at a system-wide level. Scylla has
+ its own entry in the documentation.
+ ([SmartSim-PR733](https://github.com/CrayLabs/SmartSim/pull/733))
+
+
+### 0.8.0
+
+Released on 27 September, 2024
Description
+- Add instructions for Frontier to set the MIOPEN cache
+- Refine Frontier documentation for proper use of miniforge3
+- Refactor to the RedisAI build to allow more flexibility in versions
+ and sources of ML backends
+- Add Dockerfiles with GPU support
+- Fine grain build support for GPUs
+- Update Torch to 2.1.0, Tensorflow to 2.15.0
+- Better error messages in build process
+- Allow specifying Model and Ensemble parameters with
+ number-like types (e.g. numpy types)
+- Pin watchdog to 4.x
- Update codecov to 4.5.0
- Remove build of Redis from setup.py
- Mitigate dependency installation issues
@@ -30,6 +98,46 @@ Description
Detailed Notes
+- On Frontier, the MIOPEN cache may need to be set prior to using
+  RedisAI in ``smart validate``. The instructions for Frontier
+ have been updated accordingly.
+ ([SmartSim-PR727](https://github.com/CrayLabs/SmartSim/pull/727))
+- On Frontier, the recommended way to activate conda environments is
+  to use ``source activate``. This also means that ``conda init``
+ is not needed. The instructions for Frontier have been updated to
+ reflect this.
+ ([SmartSim-PR719](https://github.com/CrayLabs/SmartSim/pull/719))
+- The RedisAIBuilder class was completely overhauled to allow users to
+ express a wider range of support for hardware/software stacks. This
+ will be extended to support ROCm, CUDA-11, and CUDA-12.
+ ([SmartSim-PR669](https://github.com/CrayLabs/SmartSim/pull/669))
+- Versions for each of these packages are no longer specified in an
+ internal class. Instead a default set of JSON files specifies the
+ sources and versions. Users can specify their own custom specifications
+ at smart build time.
+ ([SmartSim-PR669](https://github.com/CrayLabs/SmartSim/pull/669))
+- Because all build configuration has been moved to static files and all
+ backends are compiled during `smart build`, SmartSim can now be shipped as a
+ pure python wheel.
+ ([SmartSim-PR728](https://github.com/CrayLabs/SmartSim/pull/728))
+- Two new Dockerfiles are now provided (one each for CUDA 11.8 and CUDA 12.1) that
+ can be used to build a container to run the tutorials. No HPC support
+ should be expected at this time
+ ([SmartSim-PR669](https://github.com/CrayLabs/SmartSim/pull/669))
+- As a result of the previous change, SmartSim now requires C++17 and a
+  minimum CUDA version of 11.8 in order to build Torch 2.1.0.
+ ([SmartSim-PR669](https://github.com/CrayLabs/SmartSim/pull/669))
+- Error messages were not being interpolated correctly. This has been
+ addressed to provide more context when exposing error messages to users.
+ ([SmartSim-PR669](https://github.com/CrayLabs/SmartSim/pull/669))
+- The serializer would fail if a parameter for a Model or Ensemble
+ was specified as a numpy dtype. The constructors for these
+  classes now validate that the input is number-like and convert
+  it to a string
+ ([SmartSim-PR676](https://github.com/CrayLabs/SmartSim/pull/676))
+- Pin watchdog to 4.x because v5 introduces new types and requires
+ updates to the type-checking
+ ([SmartSim-PR690](https://github.com/CrayLabs/SmartSim/pull/690))
- Update codecov to 4.5.0 to mitigate GitHub action failure
([SmartSim-PR657](https://github.com/CrayLabs/SmartSim/pull/657))
- The builder module was included in setup.py to allow us to ship the
diff --git a/doc/conf.py b/doc/conf.py
index 932bce0132..8f3a9ca632 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -29,7 +29,7 @@
import smartsim
version = smartsim.__version__
except ImportError:
- version = "0.7.0"
+ version = "0.8.0"
# The full version, including alpha/beta/rc tags
release = version
diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst
index 02c17e1fda..a5db285ca8 100644
--- a/doc/installation_instructions/basic.rst
+++ b/doc/installation_instructions/basic.rst
@@ -18,7 +18,7 @@ Prerequisites
Basic
=====
-The base prerequisites to install SmartSim and SmartRedis are:
+The base prerequisites to install SmartSim and SmartRedis with CPU-only support are:
- Python 3.9-3.11
- Pip
@@ -27,13 +27,11 @@ The base prerequisites to install SmartSim and SmartRedis are:
- C++ compiler
- GNU Make > 4.0
- git
- - `git-lfs`_
-
-.. _git-lfs: https://github.com/git-lfs/git-lfs?utm_source=gitlfs_site&utm_medium=installation_link&utm_campaign=gitlfs
.. note::
- GCC 5-9, 11, and 12 is recommended. There are known bugs with GCC 10.
+   GCC 9 and 11-13 are recommended (there are known issues compiling with GCC 10). For
+ CUDA 11.8, GCC 9 or 11 must be used.
.. warning::
@@ -43,66 +41,146 @@ The base prerequisites to install SmartSim and SmartRedis are:
`which gcc g++` do not point to Apple Clang.
-GPU Support
-===========
+ML Library Support
+==================
-The machine-learning backends have additional requirements in order to
-use GPUs for inference
+We currently support both Nvidia and AMD GPUs when using RedisAI for GPU inference. The support
+for these GPUs often depends on the version of the CUDA or ROCm stack that is available on your
+machine. In *most* cases, the versions are backwards compatible. If you encounter problems, please
+contact us and we can build the backend libraries for your desired version of CUDA and ROCm.
- - `CUDA Toolkit 11 (tested with 11.8) `_
- - `cuDNN 8 (tested with 8.9.1) `_
- - OS: Linux
- - GPU: Nvidia
+CPU backends are provided for Apple (both Intel and Apple Silicon) and Linux (x86_64).
-Be sure to reference the :ref:`installation notes ` for helpful
+Be sure to reference the table below to find which versions of the ML libraries are supported for
+your particular platform. Additionally, see :ref:`installation notes ` for helpful
information regarding various system types before installation.
-==================
-Supported Versions
-==================
+Linux
+-----
+.. tabs::
-.. list-table:: Supported System for Pre-built Wheels
- :widths: 50 50 50 50
- :header-rows: 1
- :align: center
+ .. group-tab:: CUDA 11
+
+ Additional requirements:
+
+ * GCC <= 11
+ * CUDA Toolkit 11.7 or 11.8
+ * cuDNN 8.9
+
+ .. list-table:: Nvidia CUDA 11
+ :widths: 50 50 50 50
+ :header-rows: 1
+ :align: center
+
+ * - Python Versions
+ - Torch
+ - Tensorflow
+ - ONNX Runtime
+ * - 3.9-3.11
+ - 2.3.1
+ - 2.14.1
+ - 1.17.3
+
+ .. group-tab:: CUDA 12
+
+ Additional requirements:
+
+ * CUDA Toolkit 12
+ * cuDNN 8.9
+
+ .. list-table:: Nvidia CUDA 12
+ :widths: 50 50 50 50
+ :header-rows: 1
+ :align: center
+
+ * - Python Versions
+ - Torch
+ - Tensorflow
+ - ONNX Runtime
+ * - 3.9-3.11
+ - 2.3.1
+ - 2.17
+ - 1.17.3
+
+ .. group-tab:: ROCm 6
+
+ .. list-table:: AMD ROCm 6.1
+ :widths: 50 50 50 50
+ :header-rows: 1
+ :align: center
+
+ * - Python Versions
+ - Torch
+ - Tensorflow
+ - ONNX Runtime
+ * - 3.9-3.11
+ - 2.4.1
+ - N/A
+ - N/A
+
+ .. group-tab:: CPU
+
+ .. list-table:: CPU-only
+ :widths: 50 50 50 50
+ :header-rows: 1
+ :align: center
+
+ * - Python Versions
+ - Torch
+ - Tensorflow
+ - ONNX Runtime
+ * - 3.9-3.11
+ - 2.4.0
+ - 2.15
+ - 1.17.3
+
+MacOSX
+------
- * - Platform
- - CPU
- - GPU
- - Python Versions
- * - MacOS
- - x86_64, aarch64
- - Not supported
- - 3.9 - 3.11
- * - Linux
- - x86_64
- - Nvidia
- - 3.9 - 3.11
+.. tabs::
+ .. group-tab:: Apple Silicon
-.. note::
+ .. list-table:: Apple Silicon ARM64 (no Metal support)
+ :widths: 50 50 50 50
+ :header-rows: 1
+ :align: center
- Users have succesfully run SmartSim on Windows using Windows Subsystem for Linux
- with Nvidia support. Generally, users should follow the Linux instructions here,
- however we make no guarantee or offer of support.
+ * - Python Versions
+ - Torch
+ - Tensorflow
+ - ONNX Runtime
+ * - 3.9-3.11
+ - 2.4.0
+ - 2.17
+ - 1.17.3
+ .. group-tab:: Intel Mac (x86)
-Native support for various machine learning libraries and their
-versions is dictated by our dependency on RedisAI_ 1.2.7.
+ .. list-table:: CPU-only
+ :widths: 50 50 50 50
+ :header-rows: 1
+ :align: center
-+------------------+----------+-------------+---------------+
-| RedisAI | PyTorch | Tensorflow | ONNX Runtime |
-+==================+==========+=============+===============+
-| 1.2.7 (default) | 2.0.1 | 2.13.1 | 1.16.3 |
-+------------------+----------+-------------+---------------+
+ * - Python Versions
+ - Torch
+ - Tensorflow
+ - ONNX Runtime
+ * - 3.9-3.11
+ - 2.2.0
+ - 2.15
+ - 1.17.3
-.. warning::
- On Apple Silicon, only the PyTorch backend is supported for now. Please contact us
- if you need support for other backends
+.. note::
+
+ Users have successfully run SmartSim on Windows using Windows Subsystem for Linux
+ with Nvidia support. Generally, users should follow the Linux instructions here,
+ however we make no guarantee or offer of support.
+
-TensorFlow_ 2.0 and Keras_ are supported through `graph freezing`_.
+TensorFlow_ and Keras_ are supported through `graph freezing`_.
ScikitLearn_ and Spark_ models are supported by SmartSim as well
through the use of the ONNX_ runtime (which is not built by
@@ -167,21 +245,8 @@ and install SmartSim from PyPI with the following command:
pip install smartsim
-If you would like SmartSim to also install python machine learning libraries
-that can be used outside SmartSim to build SmartSim-compatible models, you
-can request their installation through the ``[ml]`` optional dependencies,
-as follows:
-
-.. code-block:: bash
-
- # For bash
- pip install smartsim[ml]
- # For zsh
- pip install smartsim\[ml\]
-
-At this point, SmartSim is installed and can be used for more basic features.
-If you want to use the machine learning features of SmartSim, you will need
-to install the ML backends in the section below.
+At this point, SmartSim can be used to describe and launch experiments, but without
+the database/feature store functionality that allows for ML-enabled workflows.
Step 2: Build SmartSim
@@ -198,19 +263,19 @@ To see all the installation options:
smart --help
-CPU Install
------------
-
-To install the default ML backends for CPU, run
-
.. code-block:: bash
# run one of the following
- smart build --device cpu # install PT and TF for cpu
- smart build --device cpu --onnx # install all backends (PT, TF, ONNX) on cpu
+ smart build --device cpu # For unaccelerated AI/ML loads
+ smart build --device cuda118 # Nvidia Accelerator with CUDA 11.8
+ smart build --device cuda125 # Nvidia Accelerator with CUDA 12.5
+ smart build --device rocm57 # AMD Accelerator with ROCm 5.7.0
-By default, ``smart`` will install PyTorch and TensorFlow backends
-for use in SmartSim.
+By default, ``smart`` will install all backends available for the specified accelerator
+*and* the compatible versions of the Python packages associated with those backends. To
+disable support for a specific backend, ``smart build`` accepts the flags
+``--skip-torch``, ``--skip-tensorflow``, and ``--skip-onnx``, which can also be used in
+combination.
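+
+For example, to keep only the PyTorch backend in a CPU-only build, the skip flags can
+be combined (an illustrative invocation):
+
+.. code-block:: bash
+
+   smart build --device cpu --skip-tensorflow --skip-onnx
+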
.. note::
@@ -218,19 +283,6 @@ for use in SmartSim.
all of the previous installs for the ML backends and ``smart clobber`` will
remove all pre-built dependencies as well as the ML backends.
-
-GPU Install
------------
-
-With the proper environment setup (see :ref:`GPU support`) the only difference
-to building SmartSim with GPU support is to specify a different ``device``
-
-.. code-block:: bash
-
- # run one of the following
- smart build --device gpu # install PT and TF for gpu
- smart build --device gpu --onnx # install all backends (PT, TF, ONNX) on gpu
-
.. note::
GPU builds can be troublesome due to the way that RedisAI and the ML-package
@@ -251,9 +303,21 @@ For example, to install dragon alongside the RedisAI CPU backends, you can run
.. code-block:: bash
- # run one of the following
smart build --device cpu --dragon # install Dragon, PT and TF for cpu
- smart build --device cpu --onnx --dragon # install Dragon and all backends (PT, TF, ONNX) on cpu
+
+``smart build`` supports installing a specific version of dragon. It exposes the
+parameters ``--dragon-repo`` and ``--dragon-version``, which can be used alone or
+in combination to customize the Dragon installation. For example:
+
+.. code-block:: bash
+
+ # using the --dragon-repo and --dragon-version flags to customize the Dragon installation
+ smart build --device cpu --dragon-repo userfork/dragon # install Dragon from a specific repo
+ smart build --device cpu --dragon-version 0.10 # install a specific Dragon release
+
+ # combining both flags
+ smart build --device cpu --dragon-repo userfork/dragon --dragon-version 0.91
+
.. note::
Dragon is only supported on Linux systems. For further information, you
@@ -319,35 +383,11 @@ source remains at the site of the clone instead of in site-packages.
.. code-block:: bash
cd smartsim
- pip install -e .[dev,ml] # for bash users
- pip install -e .\[dev,ml\] # for zsh users
-
-Use the now installed ``smart`` cli to install the machine learning runtimes and dragon.
-
-.. tabs::
-
- .. tab:: Linux
-
- .. code-block:: bash
-
- # run one of the following
- smart build --device cpu --onnx --dragon # install with cpu-only support
- smart build --device gpu --onnx --dragon # install with both cpu and gpu support
-
-
- .. tab:: MacOS (Intel x64)
-
- .. code-block:: bash
-
- smart build --device cpu --onnx # install all backends (PT, TF, ONNX) on gpu
-
-
- .. tab:: MacOS (Apple Silicon)
-
- .. code-block:: bash
-
- smart build --device cpu --no_tf # Only install PyTorch (TF/ONNX unsupported)
+ pip install -e .[dev] # for bash users
+ pip install -e ".[dev]" # for zsh users
+
+Use the now-installed ``smart`` CLI to install the machine learning runtimes and
+dragon, referring to "Step 2: Build SmartSim" above.
Build the SmartRedis library
============================
diff --git a/doc/installation_instructions/platform.rst b/doc/installation_instructions/platform.rst
index 086fc2951c..c1eb51df1a 100644
--- a/doc/installation_instructions/platform.rst
+++ b/doc/installation_instructions/platform.rst
@@ -12,12 +12,16 @@ that SmartSim may be used on.
.. include:: platform/frontier.rst
+.. include:: platform/perlmutter.rst
+
.. include:: platform/cray.rst
.. include:: platform/ncar-cheyenne.rst
.. include:: platform/olcf-summit.rst
+.. include:: platform/pml-scylla.rst
+
.. _site_installation:
.. include:: site-install.rst
diff --git a/doc/installation_instructions/platform/frontier.rst b/doc/installation_instructions/platform/frontier.rst
index e238561559..9b05061fe1 100644
--- a/doc/installation_instructions/platform/frontier.rst
+++ b/doc/installation_instructions/platform/frontier.rst
@@ -1,23 +1,15 @@
OLCF Frontier
=============
-Summary
--------
-
-Frontier is an AMD CPU/AMD GPU system.
-
-As of 2023-07-06, users can use the following instructions, however we
-anticipate that all the SmartSim dependencies will be available system-wide via
-the modules system.
-
Known limitations
-----------------
We are continually working on getting all the features of SmartSim working on
Frontier, however we do have some known limitations:
-* For now, only Torch models are supported. We are working to find a recipe to
- install Tensorflow with ROCm support from scratch
+* For now, only Torch models are supported. If you need Tensorflow or ONNX
+  support, please contact us
+* All SmartSim experiments must be run from Lustre, *not* your home directory
* The colocated database will fail without specifying ``custom_pinning``. This
is because the default pinning assumes that processor 0 is available, but the
'low-noise' default on Frontier reserves the processor on each NUMA node.
@@ -30,8 +22,8 @@ Frontier, however we do have some known limitations:
Please raise an issue in the SmartSim Github or contact the developers if the above
issues are affecting your workflow or if you find any other problems.
-Build process
--------------
+One-time Setup
+--------------
To install the SmartRedis and SmartSim python packages on Frontier, please follow
these instructions, being sure to set the following variables
@@ -39,25 +31,20 @@ these instructions, being sure to set the following variables
.. code:: bash
export PROJECT_NAME=CHANGE_ME
- export VENV_NAME=CHANGE_ME
-Then continue with the install:
+**Step 1:** Create and activate a virtual environment for SmartSim:
.. code:: bash
- module load PrgEnv-gnu-amd git-lfs cmake cray-python
- module unload xalt amd-mixed
- module load rocm/4.5.2
- export CC=gcc
- export CXX=g++
+ module load PrgEnv-gnu miniforge3 rocm/6.1.3
export SCRATCH=/lustre/orion/$PROJECT_NAME/scratch/$USER/
- export VENV_HOME=$SCRATCH/$VENV_NAME/
+ conda create -n smartsim python=3.11
+ source activate smartsim
- python3 -m venv $VENV_HOME
- source $VENV_HOME/bin/activate
- pip install torch==1.11.0+rocm4.5.2 torchvision==0.12.0+rocm4.5.2 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/rocm4.5.2
+**Step 2:** Build the SmartRedis C++ and Fortran libraries:
+.. code:: bash
cd $SCRATCH
git clone https://github.com/CrayLabs/SmartRedis.git
@@ -65,57 +52,61 @@ Then continue with the install:
make lib-with-fortran
pip install .
- # Download SmartSim and site-specific files
+**Step 3:** Install SmartSim in the conda environment:
+
+.. code:: bash
+
cd $SCRATCH
- git clone https://github.com/CrayLabs/site-deployments.git
- git clone https://github.com/CrayLabs/SmartSim.git
- cd SmartSim
- pip install -e .[dev]
+ pip install git+https://github.com/CrayLabs/SmartSim.git
-Next to finish the compilation, we need to manually modify one of the auxiliary
-cmake files that comes packaged with Torch
+**Step 4:** Build Redis, RedisAI, the backends, and all the Python packages:
.. code:: bash
- export TORCH_CMAKE_DIR=$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')
- # Manual step: modify all references to the 'rocm' directory to rocm-4.5.2
- vim $TORCH_CMAKE_DIR/Caffe2/Caffe2Targets.cmake
+ smart build --device=rocm-6
-Finally, build Redis (or keydb for a more performant solution), RedisAI, and the
-machine-learning backends using:
+**Step 5:** Check that SmartSim has been installed and built correctly:
.. code:: bash
- KEYDB_FLAG="" # set this to --keydb if desired
- smart build --device gpu --torch_dir $TORCH_CMAKE_DIR --no_tf -v $(KEYDB_FLAG)
+ # Optimizations for inference
+ export MIOPEN_USER_DB_PATH="/tmp/${USER}/my-miopen-cache"
+ export MIOPEN_CUSTOM_CACHE_DIR=$MIOPEN_USER_DB_PATH
+ rm -rf $MIOPEN_USER_DB_PATH
+ mkdir -p $MIOPEN_USER_DB_PATH
+
+ # Run the install validation utility
+ smart validate --device gpu
-Set up environment
-------------------
+The following output indicates a successful install:
+
+.. code:: bash
+
+ [SmartSim] INFO Verifying Tensor Transfer
+ [SmartSim] INFO Verifying Torch Backend
+ 16:26:35 login SmartSim[557020:MainThread] INFO Success!
+
+Post-installation
+-----------------
Before running SmartSim, the environment should match the one used to
-build, and some variables should be set to work around some ROCm PyTorch
-issues:
+build, and some variables should be set to optimize performance:
.. code:: bash
# Set these to the same values that were used for install
export PROJECT_NAME=CHANGE_ME
- export VENV_NAME=CHANGE_ME
.. code:: bash
- module load PrgEnv-gnu-amd git-lfs cmake cray-python
- module unload xalt amd-mixed
- module load rocm/4.5.2
+ module load PrgEnv-gnu miniforge3 rocm/6.1.3
+ source activate smartsim
- export SCRATCH=/lustre/orion/$PROJECT_NAME/scratch/$USER/
- export MIOPEN_USER_DB_PATH=/tmp/miopendb/
- export MIOPEN_SYSTEM_DB_PATH=$MIOPEN_USER_DB_PATH
- mkdir -p $MIOPEN_USER_DB_PATH
- export MIOPEN_DISABLE_CACHE=1
- export VENV_HOME=$SCRATCH/$VENV_NAME/
- source $VENV_HOME/bin/activate
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$VENV_HOME/lib/python3.9/site-packages/torch/lib
+ # Optimizations for inference
+ export MIOPEN_USER_DB_PATH="/tmp/${USER}/my-miopen-cache"
+ export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+ rm -rf ${MIOPEN_USER_DB_PATH}
+ mkdir -p ${MIOPEN_USER_DB_PATH}
Binding DBs to Slingshot
------------------------
@@ -129,17 +120,3 @@ following way:
exp = Experiment("my_exp", launcher="slurm")
orc = exp.create_database(db_nodes=3, interface=["hsn0","hsn1","hsn2","hsn3"], single_cmd=True)
-
-Running tests
--------------
-
-The same environment set to run SmartSim must be set to run tests. The
-environment variables needed to run the test suite are the following:
-
-.. code:: bash
-
- export SMARTSIM_TEST_ACCOUNT=PROJECT_NAME # Change this to above
- export SMARTSIM_TEST_LAUNCHER=slurm
- export SMARTSIM_TEST_DEVICE=gpu
- export SMARTSIM_TEST_PORT=6789
- export SMARTSIM_TEST_INTERFACE="hsn0,hsn1,hsn2,hsn3"
diff --git a/doc/installation_instructions/platform/olcf-summit.rst b/doc/installation_instructions/platform/olcf-summit.rst
index 7e2ba513da..07be24eec7 100644
--- a/doc/installation_instructions/platform/olcf-summit.rst
+++ b/doc/installation_instructions/platform/olcf-summit.rst
@@ -19,7 +19,7 @@ into problems.
.. code-block:: bash
# setup Python and build environment
- export ENV_NAME=smartsim-0.7.0
+ export ENV_NAME=smartsim-0.8.0
git clone https://github.com/CrayLabs/SmartRedis.git smartredis
git clone https://github.com/CrayLabs/SmartSim.git smartsim
conda config --prepend channels https://ftp.osuosl.org/pub/open-ce/1.6.1/
diff --git a/doc/installation_instructions/platform/perlmutter.rst b/doc/installation_instructions/platform/perlmutter.rst
new file mode 100644
index 0000000000..71f97a4dc9
--- /dev/null
+++ b/doc/installation_instructions/platform/perlmutter.rst
@@ -0,0 +1,64 @@
+NERSC Perlmutter
+================
+
+One-time Setup
+--------------
+
+To install SmartSim on Perlmutter, follow these steps:
+
+**Step 1:** Create and activate a conda environment for SmartSim:
+
+.. code:: bash
+
+ module load conda cudatoolkit/12.2 cudnn/8.9.3_cuda12 PrgEnv-gnu
+ conda create -n smartsim python=3.11
+ conda activate smartsim
+
+**Step 2:** Build the SmartRedis C++ and Fortran libraries:
+
+.. code:: bash
+
+ git clone https://github.com/CrayLabs/SmartRedis.git
+ cd SmartRedis
+ make lib-with-fortran
+ pip install .
+ cd ..
+
+**Step 3:** Install SmartSim in the conda environment:
+
+.. code:: bash
+
+ pip install git+https://github.com/CrayLabs/SmartSim.git
+
+**Step 4:** Build Redis, RedisAI, the backends, and all the Python packages:
+
+.. code:: bash
+
+ smart build --device=cuda-12
+
+**Step 5:** Check that SmartSim has been installed and built correctly:
+
+.. code:: bash
+
+ smart validate --device gpu
+
+The following output indicates a successful install:
+
+.. code:: bash
+
+ [SmartSim] INFO Verifying Tensor Transfer
+ [SmartSim] INFO Verifying Torch Backend
+ [SmartSim] INFO Verifying ONNX Backend
+ [SmartSim] INFO Verifying TensorFlow Backend
+ 16:26:35 login SmartSim[557020:MainThread] INFO Success!
+
+Post-installation
+-----------------
+
+After completing the above steps to install SmartSim in a conda environment, you
+can reload the conda environment by running the following commands:
+
+.. code:: bash
+
+ module load conda cudatoolkit/12.2 cudnn/8.9.3_cuda12 PrgEnv-gnu
+ conda activate smartsim
diff --git a/doc/installation_instructions/platform/pml-scylla.rst b/doc/installation_instructions/platform/pml-scylla.rst
new file mode 100644
index 0000000000..c13f178213
--- /dev/null
+++ b/doc/installation_instructions/platform/pml-scylla.rst
@@ -0,0 +1,84 @@
+PML Scylla
+==========
+
+.. warning::
+ As of September 2024, the software stack on Scylla is still being finalized.
+ Therefore, please consider these instructions as preliminary for now.
+
+One-time Setup
+--------------
+
+To install SmartSim on Scylla, follow these steps:
+
+**Step 1:** Create and activate a Python virtual environment for SmartSim:
+
+.. code:: bash
+
+    module use /scyllapfs/hpe/ashao/smartsim_dependencies/modulefiles
+ module load cudatoolkit cudnn git
+ python -m venv /scyllafps/scratch/$USER/venvs/smartsim
+ source /scyllafps/scratch/$USER/venvs/smartsim/bin/activate
+
+**Step 2:** Build the SmartRedis C++ and Fortran libraries:
+
+.. code:: bash
+
+ git clone https://github.com/CrayLabs/SmartRedis.git
+ cd SmartRedis
+ make lib-with-fortran
+ pip install .
+ cd ..
+
+**Step 3:** Install SmartSim in the virtual environment:
+
+.. code:: bash
+
+ pip install git+https://github.com/CrayLabs/SmartSim.git
+
+**Step 4:** Build Redis, RedisAI, the backends, and all the Python packages:
+
+.. code:: bash
+
+ export TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0" # Workaround for a PyTorch problem
+ smart build --device=cuda-12
+ module unload cudnn # Workaround for a PyTorch problem
+
+
+.. note::
+    The first workaround is needed because the autodetection of CUDA
+    architectures is not internally consistent with one of PyTorch's
+    dependencies. This seems to be unique to this machine, as we do not see
+    this on other platforms.
+
+ The second workaround is needed because PyTorch 2.3 (and possibly 2.2)
+ will attempt to load the version of cuDNN that is in the LD_LIBRARY_PATH
+ instead of the version shipped with PyTorch itself. This results in
+    unresolved symbols.
+
+**Step 5:** Check that SmartSim has been installed and built correctly:
+
+.. code:: bash
+
+ srun -n 1 -p gpu --gpus=1 --pty smart validate --device gpu
+
+The following output indicates a successful install:
+
+.. code:: bash
+
+ [SmartSim] INFO Verifying Tensor Transfer
+ [SmartSim] INFO Verifying Torch Backend
+ [SmartSim] INFO Verifying ONNX Backend
+ [SmartSim] INFO Verifying TensorFlow Backend
+ 16:26:35 login SmartSim[557020:MainThread] INFO Success!
+
+Post-installation
+-----------------
+
+After completing the above steps to install SmartSim in a Python virtual environment,
+you can reload the environment by running the following commands:
+
+.. code:: bash
+
+ module load cudatoolkit/12.4.1 git # cudnn should NOT be loaded
+ source /scyllafps/scratch/$USER/venvs/smartsim/bin/activate
+
diff --git a/doc/installation_instructions/site-install.rst b/doc/installation_instructions/site-install.rst
index 26ecd6c138..53e0ff8bf0 100644
--- a/doc/installation_instructions/site-install.rst
+++ b/doc/installation_instructions/site-install.rst
@@ -11,5 +11,5 @@ from source with the following steps replacing ``COMPILER_VERSION`` and
module use -a /lus/scratch/smartsim/local/modulefiles
module load cudatoolkit/11.8 cudnn smartsim-deps/COMPILER_VERSION/SMARTSIM_VERSION
- pip install smartsim[ml]
- smart build --only_python_packages --device gpu [--onnx]
+ pip install smartsim
+ smart build --skip-backends --device gpu [--onnx]
diff --git a/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb b/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb
index 2d19cab138..4afdc38955 100644
--- a/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb
+++ b/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb
@@ -44,8 +44,9 @@
],
"source": [
"## Installing the ML backends\n",
- "from smartsim._core.utils.helpers import installed_redisai_backends\n",
- "print(installed_redisai_backends())\n"
+ "# from smartsim._core.utils.helpers import installed_redisai_backends\n",
+ "#print(installed_redisai_backends())\n",
+ "# TODO: replace deprecated installed_redisai_backends"
]
},
{
@@ -132,7 +133,7 @@
"\n",
"ML Backends Requested\n",
"╒════════════╤════════╤══════╕\n",
- "│ PyTorch │ 2.0.1 │ \u001b[32mTrue\u001b[0m │\n",
+ "│ PyTorch │ 2.1.0 │ \u001b[32mTrue\u001b[0m │\n",
"│ TensorFlow │ 2.13.1 │ \u001b[32mTrue\u001b[0m │\n",
"│ ONNX │ 1.14.1 │ \u001b[32mTrue\u001b[0m │\n",
"╘════════════╧════════╧══════╛\n",
diff --git a/docker-compose.yml b/docker-compose.yml
index 0473616560..e652591620 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -14,9 +14,9 @@ services:
- "8888:8888"
tutorials-prod:
- image: smartsim-tutorials:v0.7.0
+ image: smartsim-tutorials:v0.8.0
build:
context: .
dockerfile: ./docker/prod/Dockerfile
ports:
- - "8888:8888"
\ No newline at end of file
+ - "8888:8888"
diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile
index bc92e2fd79..faeeae8f37 100644
--- a/docker/dev/Dockerfile
+++ b/docker/dev/Dockerfile
@@ -50,7 +50,7 @@ COPY . /home/craylabs/SmartSim
RUN chown craylabs:root -R SmartSim
USER craylabs
-RUN cd SmartSim && SMARTSIM_SUFFIX=dev python -m pip install .[ml]
+RUN cd SmartSim && SMARTSIM_SUFFIX=dev python -m pip install .
RUN export PATH=/home/craylabs/.local/bin:$PATH && \
echo "export PATH=/home/craylabs/.local/bin:$PATH" >> /home/craylabs/.bashrc && \
diff --git a/docker/prod-cuda11/Dockerfile b/docker/prod-cuda11/Dockerfile
new file mode 100644
index 0000000000..fc27479051
--- /dev/null
+++ b/docker/prod-cuda11/Dockerfile
@@ -0,0 +1,61 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+FROM ubuntu:22.04
+
+LABEL maintainer="Cray Labs"
+LABEL org.opencontainers.image.source https://github.com/CrayLabs/SmartSim
+
+ARG DEBIAN_FRONTEND="noninteractive"
+ENV TZ=US/Seattle
+
+# Make basic dependencies
+RUN apt-get update \
+ && apt-get install --no-install-recommends -y build-essential \
+ git gcc make git-lfs wget libopenmpi-dev openmpi-bin unzip \
+ python3-pip python3 python3-dev cmake wget apt-utils
+
+# Install CUDA Toolkit 11.8
+ENV TERM="xterm"
+RUN wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run && \
+ chmod +x ./cuda_11.8.0_520.61.05_linux.run && \
+ ./cuda_11.8.0_520.61.05_linux.run --silent --toolkit && \
+ rm ./cuda_11.8.0_520.61.05_linux.run
+
+# Install cuDNN 8.9.7
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcudnn8_8.9.7.29-1+cuda11.8_amd64.deb && \
+ dpkg -i libcudnn8_8.9.7.29-1+cuda11.8_amd64.deb && \
+ rm ./libcudnn8_8.9.7.29-1+cuda11.8_amd64.deb
+
+# Install SmartSim and SmartRedis
+RUN pip install git+https://github.com/CrayLabs/SmartRedis.git && \
+    pip install "smartsim @ git+https://github.com/CrayLabs/SmartSim.git"
+
+ENV CUDA_HOME="/usr/local/cuda/"
+ENV PATH="${PATH}:${CUDA_HOME}/bin"
+
+# Build ML Backends
+RUN smart build --device=gpu --onnx
diff --git a/docker/prod-cuda12/Dockerfile b/docker/prod-cuda12/Dockerfile
new file mode 100644
index 0000000000..bbdfd35131
--- /dev/null
+++ b/docker/prod-cuda12/Dockerfile
@@ -0,0 +1,64 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+FROM ubuntu:22.04
+
+LABEL maintainer="Cray Labs"
+LABEL org.opencontainers.image.source https://github.com/CrayLabs/SmartSim
+
+ARG DEBIAN_FRONTEND="noninteractive"
+ENV TZ=US/Seattle
+
+# Make basic dependencies
+RUN apt-get update \
+ && apt-get install --no-install-recommends -y build-essential \
+ git gcc make git-lfs wget libopenmpi-dev openmpi-bin unzip \
+ python3-pip python3 python3-dev cmake wget
+
+# Install CUDA Toolkit 12.5
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
+ dpkg -i cuda-keyring_1.1-1_all.deb && \
+ apt-get update -y && \
+ apt-get install -y cuda-toolkit-12-5
+
+# Install cuDNN 8.9.7
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcudnn8_8.9.7.29-1+cuda12.2_amd64.deb && \
+ dpkg -i libcudnn8_8.9.7.29-1+cuda12.2_amd64.deb
+
+# Install SmartSim and SmartRedis
+RUN pip install git+https://github.com/CrayLabs/SmartRedis.git && \
+ pip install git+https://github.com/CrayLabs/SmartSim.git@cuda-12-support
+
+ENV CUDA_HOME="/usr/local/cuda/"
+ENV PATH="${PATH}:${CUDA_HOME}/bin"
+
+# Install machine-learning python packages consistent with RedisAI
+# Note: pytorch gets installed in the smart build step
+# This step will be deprecated in a future update
+RUN pip install tensorflow==2.15.0
+
+# Build ML Backends
+RUN smart build --device=cuda121
diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile
index 0f5b8dafc5..f8560f7bda 100644
--- a/docker/prod/Dockerfile
+++ b/docker/prod/Dockerfile
@@ -46,7 +46,7 @@ COPY --chown=craylabs:root ./doc/tutorials/ /home/craylabs/tutorials/
USER craylabs
RUN export PATH=/home/craylabs/.local/bin:$PATH && \
echo "export PATH=/home/craylabs/.local/bin:$PATH" >> /home/craylabs/.bashrc && \
- python -m pip install smartsim[ml]==0.7.0 jupyter jupyterlab "ipython<8" matplotlib && \
+ python -m pip install smartsim==0.8.0 jupyter jupyterlab "ipython<8" matplotlib && \
smart build --device cpu -v && \
chown craylabs:root -R /home/craylabs/.local && \
rm -rf ~/.cache/pip
diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py
new file mode 100644
index 0000000000..36f427937c
--- /dev/null
+++ b/ex/high_throughput_inference/mli_driver.py
@@ -0,0 +1,77 @@
+import os
+import base64
+import cloudpickle
+import sys
+from smartsim import Experiment
+from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker
+from smartsim.status import TERMINAL_STATUSES
+from smartsim.settings import DragonRunSettings
+import time
+import typing as t
+
+DEVICE = "gpu"
+NUM_RANKS = 4
+NUM_WORKERS = 1
+filedir = os.path.dirname(__file__)
+worker_manager_script_name = os.path.join(filedir, "standalone_worker_manager.py")
+app_script_name = os.path.join(filedir, "mock_app.py")
+model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt")
+
+transport: t.Literal["hsta", "tcp"] = "hsta"
+
+os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport
+
+exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}")
+os.makedirs(exp_path, exist_ok=True)
+exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path)
+
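+# Serialize the TorchWorker class with cloudpickle so the standalone worker manager
+# can reconstruct it from the --worker_class command-line argument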
+torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii")
+
+worker_manager_rs: DragonRunSettings = exp.create_run_settings(
+ sys.executable,
+ [
+ worker_manager_script_name,
+ "--device",
+ DEVICE,
+ "--worker_class",
+ torch_worker_str,
+ "--batch_size",
+ str(NUM_RANKS//NUM_WORKERS),
+ "--batch_timeout",
+ str(0.00),
+ "--num_workers",
+ str(NUM_WORKERS)
+ ],
+)
+
+aff = []
+
+worker_manager_rs.set_cpu_affinity(aff)
+
+worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs)
+worker_manager.attach_generator_files(to_copy=[worker_manager_script_name])
+
+app_rs: DragonRunSettings = exp.create_run_settings(
+ sys.executable,
+ exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(6)],
+)
+app_rs.set_tasks_per_node(NUM_RANKS)
+
+
+app = exp.create_model("app", run_settings=app_rs)
+app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name])
+
+exp.generate(worker_manager, app, overwrite=True)
+exp.start(worker_manager, app, block=False)
+
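+# Poll job statuses: once either the app or the worker manager reaches a terminal
+# state, stop the other and exit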
+while True:
+ if exp.get_status(app)[0] in TERMINAL_STATUSES:
+ time.sleep(10)
+ exp.stop(worker_manager)
+ break
+ if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES:
+ time.sleep(10)
+ exp.stop(app)
+ break
+
+print("Exiting.")
diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py
new file mode 100644
index 0000000000..c3b3eaaf4c
--- /dev/null
+++ b/ex/high_throughput_inference/mock_app.py
@@ -0,0 +1,142 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# isort: off
+import dragon
+from dragon import fli
+from dragon.channels import Channel
+import dragon.channels
+from dragon.data.ddict.ddict import DDict
+from dragon.globalservices.api_setup import connect_to_infrastructure
+from dragon.utils import b64decode, b64encode
+
+# isort: on
+
+import argparse
+import io
+
+import torch
+
+from smartsim.log import get_logger
+
+torch.set_num_interop_threads(16)
+torch.set_num_threads(1)
+
+logger = get_logger("App")
+logger.info("Started app")
+
+from collections import OrderedDict
+
+from smartsim.log import get_logger, log_to_file
+from smartsim._core.mli.client.protoclient import ProtoClient
+
+logger = get_logger("App")
+
+
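+# When True, each remote inference result is re-checked against a local PyTorch run,
+# which slows the benchmark considerably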
+CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False
+
+
+class ResNetWrapper:
+    """Wrapper around a pre-trained ResNet model."""
+ def __init__(self, name: str, model: str):
+ """Initialize the instance.
+
+ :param name: The name to use for the model
+ :param model: The path to the pre-trained PyTorch model"""
+ self._model = torch.jit.load(model)
+ self._name = name
+ buffer = io.BytesIO()
+ scripted = torch.jit.trace(self._model, self.get_batch())
+ torch.jit.save(scripted, buffer)
+ self._serialized_model = buffer.getvalue()
+
+ def get_batch(self, batch_size: int = 32):
+ """Create a random batch of data with the correct dimensions to
+ invoke a ResNet model.
+
+ :param batch_size: The desired number of samples to produce
+ :returns: A PyTorch tensor"""
+ return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32)
+
+ @property
+ def model(self) -> bytes:
+ """The content of a model file.
+
+ :returns: The model bytes"""
+ return self._serialized_model
+
+ @property
+ def name(self) -> str:
+ """The name applied to the model.
+
+ :returns: The name"""
+ return self._name
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser("Mock application")
+ parser.add_argument("--device", default="cpu", type=str)
+ parser.add_argument("--log_max_batchsize", default=8, type=int)
+ args = parser.parse_args()
+
+ resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt")
+
+ client = ProtoClient(timing_on=True)
+ client.set_model(resnet.name, resnet.model)
+
+ if CHECK_RESULTS_AND_MAKE_ALL_SLOWER:
+ # TODO: adapt to non-Nvidia devices
+ torch_device = args.device.replace("gpu", "cuda")
+ pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to(
+ torch_device
+ )
+
+ TOTAL_ITERATIONS = 100
+
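+    # Sweep batch sizes in powers of two up to 2**log_max_batchsize, timing each remote inference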
+ for log2_bsize in range(args.log_max_batchsize + 1):
+ b_size: int = 2**log2_bsize
+ logger.info(f"Batch size: {b_size}")
+ for iteration_number in range(TOTAL_ITERATIONS + int(b_size == 1)):
+ logger.info(f"Iteration: {iteration_number}")
+ sample_batch = resnet.get_batch(b_size)
+ remote_result = client.run_model(resnet.name, sample_batch)
+ logger.info(client.perf_timer.get_last("total_time"))
+ if CHECK_RESULTS_AND_MAKE_ALL_SLOWER:
+ local_res = pt_model(sample_batch.to(torch_device))
+ err_norm = torch.linalg.vector_norm(
+ torch.flatten(remote_result).to(torch_device)
+ - torch.flatten(local_res),
+ ord=1,
+ ).cpu()
+ res_norm = torch.linalg.vector_norm(remote_result, ord=1).item()
+ local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item()
+ logger.info(
+ f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}"
+ )
+ torch.cuda.synchronize()
+
+ client.perf_timer.print_timings(to_file=True)
diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py
new file mode 100644
index 0000000000..8978bcea23
--- /dev/null
+++ b/ex/high_throughput_inference/mock_app_redis.py
@@ -0,0 +1,90 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import io
+import numpy
+import time
+import torch
+from mpi4py import MPI
+from smartsim.log import get_logger
+from smartsim._core.utils.timings import PerfTimer
+from smartredis import Client
+
+logger = get_logger("App")
+
+class ResNetWrapper():
+ def __init__(self, name: str, model: str):
+ self._model = torch.jit.load(model)
+ self._name = name
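+        # trace the loaded model with a sample batch and serialize the result
+        # to bytes so it can be uploaded to the database via Client.set_model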
+ buffer = io.BytesIO()
+ scripted = torch.jit.trace(self._model, self.get_batch())
+ torch.jit.save(scripted, buffer)
+ self._serialized_model = buffer.getvalue()
+
+    def get_batch(self, batch_size: int = 32):
+ return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32)
+
+ @property
+ def model(self):
+ return self._serialized_model
+
+ @property
+ def name(self):
+ return self._name
+
+if __name__ == "__main__":
+
+ comm = MPI.COMM_WORLD
+ rank = comm.Get_rank()
+
+ parser = argparse.ArgumentParser("Mock application")
+ parser.add_argument("--device", default="cpu")
+ args = parser.parse_args()
+
+    resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt")
+
+ client = Client(cluster=False, address=None)
+ client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper())
+
+    perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=True, prefix=f"redis{rank}_")
+
+ total_iterations = 100
+    timings = []
+    for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]:
+        logger.info(f"Batch size: {batch_size}")
+        for iteration_number in range(total_iterations + int(batch_size == 1)):
+ perf_timer.start_timings("batch_size", batch_size)
+ logger.info(f"Iteration: {iteration_number}")
+ input_name = f"batch_{rank}"
+ output_name = f"result_{rank}"
+ client.put_tensor(name=input_name, data=resnet.get_batch(batch_size).numpy())
+ client.run_model(name=resnet.name, inputs=[input_name], outputs=[output_name])
+ result = client.get_tensor(name=output_name)
+ perf_timer.end_timings()
+
+
+ perf_timer.print_timings(True)
diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py
new file mode 100644
index 0000000000..ff57725d40
--- /dev/null
+++ b/ex/high_throughput_inference/redis_driver.py
@@ -0,0 +1,66 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import sys
+from smartsim import Experiment
+from smartsim.status import TERMINAL_STATUSES
+import time
+
+DEVICE = "gpu"
+filedir = os.path.dirname(__file__)
+app_script_name = os.path.join(filedir, "mock_app_redis.py")
+model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt")
+
+
+exp_path = os.path.join(filedir, "redis_ai_multi")
+os.makedirs(exp_path, exist_ok=True)
+exp = Experiment("redis_ai_multi", launcher="slurm", exp_path=exp_path)
+
+db = exp.create_database(interface="hsn0")
+
+app_rs = exp.create_run_settings(
+ sys.executable, exe_args = [app_script_name, "--device", DEVICE]
+ )
+app_rs.set_nodes(1)
+app_rs.set_tasks(4)
+app = exp.create_model("app", run_settings=app_rs)
+app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name])
+
+exp.generate(db, app, overwrite=True)
+
+exp.start(db, app, block=False)
+
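+# poll until either the application or the database reaches a terminal state,
+# then stop the other before exiting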
+while True:
+ if exp.get_status(app)[0] in TERMINAL_STATUSES:
+ exp.stop(db)
+ break
+ if exp.get_status(db)[0] in TERMINAL_STATUSES:
+ exp.stop(app)
+ break
+ time.sleep(5)
+
+print("Exiting.")
\ No newline at end of file
diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py
new file mode 100644
index 0000000000..b4527bc5d2
--- /dev/null
+++ b/ex/high_throughput_inference/standalone_worker_manager.py
@@ -0,0 +1,218 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import dragon
+
+# pylint: disable=import-error
+import dragon.infrastructure.policy as dragon_policy
+import dragon.infrastructure.process_desc as dragon_process_desc
+import dragon.native.process as dragon_process
+from dragon import fli
+from dragon.channels import Channel
+from dragon.data.ddict.ddict import DDict
+from dragon.globalservices.api_setup import connect_to_infrastructure
+from dragon.managed_memory import MemoryPool
+from dragon.utils import b64decode, b64encode
+
+# pylint: enable=import-error
+
+# isort: off
+# isort: on
+
+import argparse
+import base64
+import multiprocessing as mp
+import os
+import socket
+import time
+import typing as t
+
+import cloudpickle
+
+from smartsim._core.entrypoints.service import Service
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.control.request_dispatcher import (
+ RequestDispatcher,
+)
+from smartsim._core.mli.infrastructure.control.worker_manager import WorkerManager
+from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
+ DragonFeatureStore,
+)
+from smartsim.log import get_logger
+
+logger = get_logger("Worker Manager Entry Point")
+
+mp.set_start_method("dragon")
+
+pid = os.getpid()
+affinity = os.sched_getaffinity(pid)
+logger.info(f"Entry point: {socket.gethostname()}, {affinity}")
+logger.info(f"CPUS: {os.cpu_count()}")
+
+
+def service_as_dragon_proc(
+ service: Service, cpu_affinity: list[int], gpu_affinity: list[int]
+) -> dragon_process.Process:
+
+ options = dragon_process_desc.ProcessOptions(make_inf_channels=True)
+ local_policy = dragon_policy.Policy(
+ placement=dragon_policy.Policy.Placement.HOST_NAME,
+ host_name=socket.gethostname(),
+ cpu_affinity=cpu_affinity,
+ gpu_affinity=gpu_affinity,
+ )
+ return dragon_process.Process(
+ target=service.execute,
+ args=[],
+ cwd=os.getcwd(),
+ policy=local_policy,
+ options=options,
+ stderr=dragon_process.Popen.STDOUT,
+ stdout=dragon_process.Popen.STDOUT,
+ )
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser("Worker Manager")
+ parser.add_argument(
+ "--device",
+ type=str,
+ default="gpu",
+ choices="gpu cpu".split(),
+ help="Device on which the inference takes place",
+ )
+ parser.add_argument(
+ "--worker_class",
+ type=str,
+ required=True,
+ help="Serialized class of worker to run",
+ )
+ parser.add_argument(
+ "--num_workers", type=int, default=1, help="Number of workers to run"
+ )
+ parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=1,
+ help="How many requests the workers will try to aggregate before processing them",
+ )
+ parser.add_argument(
+ "--batch_timeout",
+ type=float,
+ default=0.001,
+ help="How much time (in seconds) should be waited before processing an incomplete aggregated request",
+ )
+ args = parser.parse_args()
+
+ connect_to_infrastructure()
+ ddict_str = os.environ[BackboneFeatureStore.MLI_BACKBONE]
+
+ backbone = BackboneFeatureStore.from_descriptor(ddict_str)
+
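+    # create the FLI channel used to receive inference requests and publish
+    # its descriptor to the backbone so clients can discover it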
+ to_worker_channel = create_local()
+ to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None)
+ to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli)
+
+ backbone.worker_queue = to_worker_fli_comm_ch.descriptor
+
+ os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = to_worker_fli_comm_ch.descriptor
+ os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor
+
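+    # the worker class arrives on the command line as a base64-encoded
+    # cloudpickle payload; decode it back into the class to instantiate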
+ arg_worker_type = cloudpickle.loads(
+ base64.b64decode(args.worker_class.encode("ascii"))
+ )
+
+ config_loader = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=DragonCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+
+ dispatcher = RequestDispatcher(
+ batch_timeout=args.batch_timeout,
+ batch_size=args.batch_size,
+ config_loader=config_loader,
+ worker_type=arg_worker_type,
+ )
+
+ wms = []
+ worker_device = args.device
+ for wm_idx in range(args.num_workers):
+
+ worker_manager = WorkerManager(
+ config_loader=config_loader,
+ worker_type=arg_worker_type,
+ as_service=True,
+ cooldown=10,
+ device=worker_device,
+ dispatcher_queue=dispatcher.task_queue,
+ )
+
+ wms.append(worker_manager)
+
+ wm_affinity: list[int] = []
+ disp_affinity: list[int] = []
+
+ # This is hardcoded for a specific type of node:
+ # the GPU-to-CPU mapping is taken from the nvidia-smi tool
+ # TODO can this be computed on the fly?
+ gpu_to_cpu_aff: dict[int, list[int]] = {}
+ gpu_to_cpu_aff[0] = list(range(48, 64)) + list(range(112, 128))
+ gpu_to_cpu_aff[1] = list(range(32, 48)) + list(range(96, 112))
+ gpu_to_cpu_aff[2] = list(range(16, 32)) + list(range(80, 96))
+ gpu_to_cpu_aff[3] = list(range(0, 16)) + list(range(64, 80))
+
+ worker_manager_procs = []
+ for worker_idx in range(args.num_workers):
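+        # assign each worker manager all but four of the CPUs mapped to its GPU;
+        # the remaining four per GPU are pooled for the request dispatcher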
+ wm_cpus = len(gpu_to_cpu_aff[worker_idx]) - 4
+ wm_affinity = gpu_to_cpu_aff[worker_idx][:wm_cpus]
+ disp_affinity.extend(gpu_to_cpu_aff[worker_idx][wm_cpus:])
+ worker_manager_procs.append(
+ service_as_dragon_proc(
+ worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[worker_idx]
+ )
+ )
+
+ dispatcher_proc = service_as_dragon_proc(
+ dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]
+ )
+
+ # TODO: use ProcessGroup and restart=True?
+ all_procs = [dispatcher_proc, *worker_manager_procs]
+
+ print(f"Dispatcher proc: {dispatcher_proc}")
+ for proc in all_procs:
+ proc.start()
+
+ while all(proc.is_alive for proc in all_procs):
+ time.sleep(1)
diff --git a/pyproject.toml b/pyproject.toml
index 5b81676a35..bf721b0c99 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -69,6 +69,7 @@ markers = [
"group_a: fast test subset a",
"group_b: fast test subset b",
"slow_tests: tests that take a long duration to complete",
+ "dragon: tests that must be executed in a dragon runtime",
]
[tool.isort]
diff --git a/setup.py b/setup.py
index 328bf1ffb6..cd5ace55db 100644
--- a/setup.py
+++ b/setup.py
@@ -119,6 +119,7 @@
class BuildError(Exception):
pass
+
# Define needed dependencies for the installation
extras_require = {
@@ -137,7 +138,7 @@ class BuildError(Exception):
"types-redis",
"types-tabulate",
"types-tqdm",
- "types-tensorflow==2.12.0.9",
+ "types-tensorflow",
"types-setuptools",
"typing_extensions>=4.1.0",
],
@@ -151,7 +152,7 @@ class BuildError(Exception):
"nbsphinx==0.9.3",
"docutils==0.18.1",
"torch==2.0.1",
- "tensorflow==2.13.1",
+ "tensorflow>=2.14,<3.0",
"ipython",
"jinja2==3.1.2",
"sphinx-design",
@@ -159,8 +160,6 @@ class BuildError(Exception):
"sphinx-autodoc-typehints",
"myst_parser",
],
- # see smartsim/_core/_install/buildenv.py for more details
- **versions.ml_extras_required(),
}
@@ -175,14 +174,16 @@ class BuildError(Exception):
"redis>=4.5",
"tqdm>=4.50.2",
"filelock>=3.4.2",
- "protobuf~=3.20",
+ "GitPython<=3.1.43",
+ "protobuf<=3.20.3",
"jinja2>=3.1.2",
- "watchdog>=4.0.0",
- "pydantic==1.10.14",
+ "pycapnp==2.0.0",
+ "watchdog>4,<5",
+ "pydantic>2",
"pyzmq>=25.1.2",
"pygithub>=2.3.0",
"numpy<2",
- "smartredis>=0.5,<0.6",
+ "smartredis>=0.6,<0.7",
],
zip_safe=False,
extras_require=extras_require,
diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py
index 8e6f94722c..a0dc489f6a 100644
--- a/smartsim/_core/_cli/build.py
+++ b/smartsim/_core/_cli/build.py
@@ -25,23 +25,38 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
+import importlib.metadata
+import operator
import os
-import platform
+from pathlib import Path
+import re
+import textwrap
import typing as t
from tabulate import tabulate
-from smartsim._core._cli.scripts.dragon_install import install_dragon
-from smartsim._core._cli.utils import SMART_LOGGER_FORMAT, pip
-from smartsim._core._install import builder
-from smartsim._core._install.buildenv import (
- BuildEnv,
- SetupError,
- Version_,
- VersionConflictError,
- Versioner,
+from smartsim._core._cli.scripts.dragon_install import (
+ DEFAULT_DRAGON_REPO,
+ DEFAULT_DRAGON_VERSION,
+ DragonInstallRequest,
+ display_post_install_logs,
+ install_dragon,
)
-from smartsim._core._install.builder import BuildError, Device
+from smartsim._core._cli.utils import SMART_LOGGER_FORMAT
+from smartsim._core._install.buildenv import BuildEnv, Version_, Versioner
+from smartsim._core._install.mlpackages import (
+ DEFAULT_MLPACKAGE_PATH,
+ DEFAULT_MLPACKAGES,
+ MLPackageCollection,
+ load_platform_configs,
+)
+from smartsim._core._install.platform import (
+ Architecture,
+ Device,
+ OperatingSystem,
+ Platform,
+)
+
from smartsim._core.config import CONFIG
from smartsim.log import get_logger
@@ -55,154 +70,66 @@
# NOTE: all smartsim modules need full paths as the smart cli
# may be installed into a different directory.
-_TPinningStr = t.Literal["==", "!=", ">=", ">", "<=", "<", "~="]
-
-
-def check_py_onnx_version(versions: Versioner) -> None:
- """Check Python environment for ONNX installation"""
- _check_packages_in_python_env(
- {
- "onnx": Version_(versions.ONNX),
- "skl2onnx": "1.16.0",
- "onnxmltools": "1.12.0",
- "scikit-learn": "1.3.2",
- },
- )
-
-
-def check_py_tf_version(versions: Versioner) -> None:
- """Check Python environment for TensorFlow installation"""
- _check_packages_in_python_env({"tensorflow": Version_(versions.TENSORFLOW)})
-
-def build_feature_store(build_env: BuildEnv, verbose: bool) -> None:
- # check feature store installation
- feature_store_builder = builder.FeatureStoreBuilder(
- build_env(),
- jobs=build_env.JOBS,
- _os=builder.OperatingSystem.from_str(platform.system()),
- architecture=builder.Architecture.from_str(platform.machine()),
- malloc=build_env.MALLOC,
- verbose=verbose,
- )
-
- if not feature_store_builder.is_built:
- logger.info("No feature store is currently being built by 'smart build'")
-
- feature_store_builder.cleanup()
- logger.info("No feature store is currently being built by 'smart build'")
-
-
-def check_py_torch_version(versions: Versioner, device: Device = Device.CPU) -> None:
- """Check Python environment for TensorFlow installation"""
- if BuildEnv.is_macos():
- if device == Device.GPU:
- raise BuildError("SmartSim does not support GPU on MacOS")
- device_suffix = ""
- else: # linux
- if device == Device.CPU:
- device_suffix = versions.TORCH_CPU_SUFFIX
- elif device == Device.GPU:
- device_suffix = versions.TORCH_CUDA_SUFFIX
- else:
- raise BuildError("Unrecognized device requested")
-
- torch_deps = {
- "torch": Version_(f"{versions.TORCH}{device_suffix}"),
- "torchvision": Version_(f"{versions.TORCHVISION}{device_suffix}"),
+def parse_requirement(
+ requirement: str,
+) -> t.Tuple[str, t.Optional[str], t.Callable[[Version_], bool]]:
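+    """Parse a pip-style requirement string (e.g. ``torch==2.0.1+cpu``) into the
+    package name, an optional pinning string (e.g. ``==2.0.1+cpu``), and a
+    predicate that reports whether an installed version satisfies the pin.
+
+    :param requirement: the requirement string to parse
+    :returns: a tuple of (package name, pinning string or None, compatibility check)
+    :raises ValueError: if the requirement string cannot be parsed
+    """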
+ operators = {
+ "==": operator.eq,
+ "<=": operator.le,
+ ">=": operator.ge,
+ "<": operator.lt,
+ ">": operator.gt,
}
- missing, conflicts = _assess_python_env(
- torch_deps,
- package_pinning="==",
- validate_installed_version=_create_torch_version_validator(
- with_suffix=device_suffix
- ),
+ semantic_version_pattern = r"\d+(?:\.\d+(?:\.\d+)?)?([^\s]*)"
+ pattern = (
+ r"^" # Start
+ r"([a-zA-Z0-9_\-]+)" # Package name
+ r"(?:\[[a-zA-Z0-9_\-,]+\])?" # Any extras
+ r"(?:([<>=!~]{1,2})" # Pinning string
+ rf"({semantic_version_pattern}))?" # A version number
+ r"$" # End
)
-
- if len(missing) == len(torch_deps) and not conflicts:
- # All PyTorch deps are not installed and there are no conflicting
- # python packages. We can try to install torch deps into the current env.
- logger.info(
- "Torch version not found in python environment. "
- "Attempting to install via `pip`"
- )
- wheel_device = (
- device.value if device == Device.CPU else device_suffix.replace("+", "")
- )
- pip(
- "install",
- "--extra-index-url",
- f"https://download.pytorch.org/whl/{wheel_device}",
- *(f"{package}=={version}" for package, version in torch_deps.items()),
- )
- elif missing or conflicts:
- logger.warning(_format_incompatible_python_env_message(missing, conflicts))
-
-
-def _create_torch_version_validator(
- with_suffix: str,
-) -> t.Callable[[str, t.Optional[Version_]], bool]:
- def check_torch_version(package: str, version: t.Optional[Version_]) -> bool:
- if not BuildEnv.check_installed(package, version):
- return False
- # Default check only looks at major/minor version numbers,
- # Torch requires we look at the patch as well
- installed = BuildEnv.get_py_package_version(package)
- if with_suffix and with_suffix not in installed.patch:
- raise VersionConflictError(
- package,
- installed,
- version or Version_(f"X.X.X{with_suffix}"),
- msg=(
- f"{package}=={installed} does not satisfy device "
- f"suffix requirement: {with_suffix}"
- ),
+ match = re.match(pattern, requirement)
+ if match is None:
+ raise ValueError(f"Invalid requirement string: {requirement}")
+ module_name, cmp_op, version_str, suffix = match.groups()
+ version = Version_(version_str) if version_str is not None else None
+ if cmp_op is None:
+ is_compatible = lambda _: True # pylint: disable=unnecessary-lambda-assignment
+ elif (cmp := operators.get(cmp_op, None)) is None:
+ raise ValueError(f"Unrecognized comparison operator: {cmp_op}")
+ else:
+
+ def is_compatible(other: Version_) -> bool:
+ assert version is not None # For type check, always should be true
+ match_ = re.match(rf"^{semantic_version_pattern}$", other)
+ return (
+ cmp(other, version) and match_ is not None and match_.group(1) == suffix
)
- return True
-
- return check_torch_version
-
-
-def _check_packages_in_python_env(
- packages: t.Mapping[str, t.Optional[Version_]],
- package_pinning: _TPinningStr = "==",
- validate_installed_version: t.Optional[
- t.Callable[[str, t.Optional[Version_]], bool]
- ] = None,
-) -> None:
- # TODO: Do not like how the default validation function will always look for
- # a `==` pinning. Maybe turn `BuildEnv.check_installed` into a factory
- # that takes a pinning and returns an appropriate validation fn?
- validate_installed_version = validate_installed_version or BuildEnv.check_installed
- missing, conflicts = _assess_python_env(
- packages,
- package_pinning,
- validate_installed_version,
- )
- if missing or conflicts:
- logger.warning(_format_incompatible_python_env_message(missing, conflicts))
+ return module_name, f"{cmp_op}{version}" if version else None, is_compatible
-def _assess_python_env(
- packages: t.Mapping[str, t.Optional[Version_]],
- package_pinning: _TPinningStr,
- validate_installed_version: t.Callable[[str, t.Optional[Version_]], bool],
-) -> t.Tuple[t.List[str], t.List[str]]:
- missing: t.List[str] = []
- conflicts: t.List[str] = []
+def check_ml_python_packages(packages: MLPackageCollection) -> None:
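+    """Compare the python packages required by each ML backend against the
+    active python environment and warn about any that are missing or
+    installed at an incompatible version.
+
+    :param packages: the collection of ML packages selected for this platform
+    """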
+ missing = []
+ conflicts = []
- for name, version in packages.items():
- spec = f"{name}{package_pinning}{version}" if version else name
- try:
- if not validate_installed_version(name, version):
- # Not installed!
- missing.append(spec)
- except VersionConflictError:
- # Incompatible version found
- conflicts.append(spec)
+ for package in packages.values():
+ for requirement in package.python_packages:
+ module_name, version_spec, is_compatible = parse_requirement(requirement)
+ try:
+ installed = BuildEnv.get_py_package_version(module_name)
+ if not is_compatible(installed):
+ conflicts.append(
+ f"{module_name}: {installed} is installed, "
+ f"but {version_spec or 'Any'} is required"
+ )
+ except importlib.metadata.PackageNotFoundError:
+ missing.append(module_name)
- return missing, conflicts
+ if missing or conflicts:
+ logger.warning(_format_incompatible_python_env_message(missing, conflicts))
def _format_incompatible_python_env_message(
@@ -215,13 +142,19 @@ def _format_incompatible_python_env_message(
missing_str = fmt_list("Missing", missing)
conflict_str = fmt_list("Conflicting", conflicting)
sep = "\n" if missing_str and conflict_str else ""
- return (
- "Python Env Status Warning!\n"
- "Requested Packages are Missing or Conflicting:\n\n"
- f"{missing_str}{sep}{conflict_str}\n\n"
- "Consider installing packages at the requested versions via `pip` or "
- "uninstalling them, installing SmartSim with optional ML dependencies "
- "(`pip install smartsim[ml]`), and running `smart clean && smart build ...`"
+
+ return textwrap.dedent(
+ f"""\
+ Python Package Warning:
+
+ Requested packages are missing or have a version mismatch with
+ their respective backend:
+
+ {missing_str}{sep}{conflict_str}
+
+ Consider uninstalling any conflicting packages and rerunning
+ `smart build` if you encounter issues.
+ """
)
@@ -229,13 +162,30 @@ def _format_incompatible_python_env_message(
def execute(
args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, /
) -> int:
+
+ # Unpack various arguments
verbose = args.v
device = Device(args.device.lower())
is_dragon_requested = args.dragon
- # torch and tf build by default
- pt = not args.no_pt # pylint: disable=invalid-name
- tf = not args.no_tf # pylint: disable=invalid-name
- onnx = args.onnx
+ dragon_repo = args.dragon_repo
+ dragon_version = args.dragon_version
+
+ # The user should never have to specify the OS and Architecture
+ current_platform = Platform(
+ OperatingSystem.autodetect(), Architecture.autodetect(), device
+ )
+
+ # Configure the ML Packages
+ configs = load_platform_configs(Path(args.config_dir))
+ mlpackages = configs[current_platform]
+
+ # Build all backends by default, pop off the ones that user wants skipped
+ if args.skip_torch and "libtorch" in mlpackages:
+ mlpackages.pop("libtorch")
+ if args.skip_tensorflow and "libtensorflow" in mlpackages:
+ mlpackages.pop("libtensorflow")
+ if args.skip_onnx and "onnxruntime" in mlpackages:
+ mlpackages.pop("onnxruntime")
build_env = BuildEnv(checks=True)
logger.info("Running SmartSim build process...")
@@ -257,41 +207,40 @@ def execute(
version_names = list(vers.keys())
print(tabulate(vers, headers=version_names, tablefmt="github"), "\n")
- if is_dragon_requested:
+ logger.info("ML Packages")
+ print(mlpackages)
+
+ if is_dragon_requested or dragon_repo or dragon_version:
install_to = CONFIG.core_path / ".dragon"
- return_code = install_dragon(install_to)
+
+ try:
+ request = DragonInstallRequest(
+ install_to,
+ dragon_repo,
+ dragon_version,
+ )
+ return_code = install_dragon(request)
+ except ValueError as ex:
+ return_code = 2
+ logger.error(" ".join(ex.args))
if return_code == 0:
- logger.info("Dragon installation complete")
+ display_post_install_logs()
+
elif return_code == 1:
logger.info("Dragon installation not supported on platform")
else:
logger.warning("Dragon installation failed")
- try:
- if not args.only_python_packages:
- ...
-
- except (SetupError, BuildError) as e:
- logger.error(str(e))
- return os.EX_SOFTWARE
-
backends = []
backends_str = ", ".join(s.capitalize() for s in backends) if backends else "No"
- logger.info(f"{backends_str} backend(s) built")
-
- try:
- # TODO: always installing torch, otherwise tests will fail.
- # Should revert once torch install has been revamped
- if "torch" in backends or True:
- check_py_torch_version(versions, device)
- if "tensorflow" in backends:
- check_py_tf_version(versions)
- if "onnxruntime" in backends:
- check_py_onnx_version(versions)
- except (SetupError, BuildError) as e:
- logger.error(str(e))
- return os.EX_SOFTWARE
+ logger.info(f"{backends_str} backend(s) available")
+
+ if not args.skip_python_packages:
+ for package in mlpackages.values():
+ logger.info(f"Installing python packages for {package.name}")
+ package.pip_install(quiet=not verbose)
+ check_ml_python_packages(mlpackages)
logger.info("SmartSim build complete!")
return os.EX_OK
@@ -299,7 +248,14 @@ def execute(
def configure_parser(parser: argparse.ArgumentParser) -> None:
"""Builds the parser for the command"""
- warn_usage = "(ONLY USE IF NEEDED)"
+
+ available_devices = []
+ for platform in DEFAULT_MLPACKAGES:
+ if (platform.operating_system == OperatingSystem.autodetect()) and (
+ platform.architecture == Architecture.autodetect()
+ ):
+ available_devices.append(platform.device.value)
+
parser.add_argument(
"-v",
action="store_true",
@@ -310,7 +266,7 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
"--device",
type=str.lower,
default=Device.CPU.value,
- choices=[device.value for device in Device],
+ choices=available_devices,
help="Device to build ML runtimes for",
)
parser.add_argument(
@@ -320,44 +276,48 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
help="Install the dragon runtime",
)
parser.add_argument(
- "--only_python_packages",
- action="store_true",
- default=False,
- help="Only evaluate the python packages (i.e. skip building backends)",
+ "--dragon-repo",
+ default=None,
+ type=str,
+ help=(
+ "Specify a git repo containing dragon release assets "
+ f"(e.g. {DEFAULT_DRAGON_REPO})"
+ ),
+ )
+ parser.add_argument(
+ "--dragon-version",
+ default=None,
+ type=str,
+ help=f"Specify the dragon version to install (e.g. {DEFAULT_DRAGON_VERSION})",
)
parser.add_argument(
- "--no_pt",
+ "--skip-python-packages",
action="store_true",
- default=False,
- help="Do not build PyTorch backend",
+ help="Do not install the python packages that match the backends",
)
parser.add_argument(
- "--no_tf",
+ "--skip-backends",
action="store_true",
- default=False,
- help="Do not build TensorFlow backend",
+ help="Do not compile RedisAI and the backends",
)
parser.add_argument(
- "--onnx",
+ "--skip-torch",
action="store_true",
- default=False,
- help="Build ONNX backend (off by default)",
+ help="Do not build PyTorch backend",
)
parser.add_argument(
- "--torch_dir",
- default=None,
- type=str,
- help=f"Path to custom /torch/share/cmake/Torch/ directory {warn_usage}",
+ "--skip-tensorflow",
+ action="store_true",
+ help="Do not build TensorFlow backend",
)
parser.add_argument(
- "--libtensorflow_dir",
- default=None,
- type=str,
- help=f"Path to custom libtensorflow directory {warn_usage}",
+ "--skip-onnx",
+ action="store_true",
+ help="Do not build the ONNX backend",
)
parser.add_argument(
- "--no_torch_with_mkl",
- dest="torch_with_mkl",
- action="store_false",
- help="Do not build Torch with Intel MKL",
+ "--config-dir",
+ default=str(DEFAULT_MLPACKAGE_PATH),
+ type=str,
+ help="Path to directory with JSON files describing platform and packages",
)
diff --git a/smartsim/_core/_cli/info.py b/smartsim/_core/_cli/info.py
index ec50e151aa..7fa094fbdc 100644
--- a/smartsim/_core/_cli/info.py
+++ b/smartsim/_core/_cli/info.py
@@ -6,9 +6,7 @@
from tabulate import tabulate
-import smartsim._core._cli.utils as _utils
import smartsim._core.utils.helpers as _helpers
-from smartsim._core._cli.scripts.dragon_install import dragon_pin
from smartsim._core._install.buildenv import BuildEnv as _BuildEnv
_MISSING_DEP = _helpers.colorize("Not Installed", "red")
@@ -30,7 +28,8 @@ def execute(
)
print("Dragon Installation:")
- dragon_version = dragon_pin()
+ # TODO: Fix hardcoded dragon version
+ dragon_version = "0.10"
fs_table = [["Version", str(dragon_version)]]
print(tabulate(fs_table, tablefmt="fancy_outline"), end="\n\n")
diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py
index a2e8ed36ff..7a7d75f1d2 100644
--- a/smartsim/_core/_cli/scripts/dragon_install.py
+++ b/smartsim/_core/_cli/scripts/dragon_install.py
@@ -1,13 +1,19 @@
import os
import pathlib
+import re
+import shutil
import sys
import typing as t
+from urllib.request import Request, urlopen
from github import Github
+from github.Auth import Token
+from github.GitRelease import GitRelease
from github.GitReleaseAsset import GitReleaseAsset
+from github.Repository import Repository
from smartsim._core._cli.utils import pip
-from smartsim._core._install.builder import WebTGZ
+from smartsim._core._install.utils import retrieve
from smartsim._core.config import CONFIG
from smartsim._core.utils.helpers import check_platform, is_crayex_platform
from smartsim.error.errors import SmartSimCLIActionCancelled
@@ -15,20 +21,90 @@
logger = get_logger(__name__)
+DEFAULT_DRAGON_REPO = "DragonHPC/dragon"
+DEFAULT_DRAGON_VERSION = "0.10"
+DEFAULT_DRAGON_VERSION_TAG = f"v{DEFAULT_DRAGON_VERSION}"
+_GH_TOKEN = "SMARTSIM_DRAGON_TOKEN"
-def create_dotenv(dragon_root_dir: pathlib.Path) -> None:
+
+class DragonInstallRequest:
+ """Encapsulates a request to install the dragon package"""
+
+ def __init__(
+ self,
+ working_dir: pathlib.Path,
+ repo_name: t.Optional[str] = None,
+ version: t.Optional[str] = None,
+ ) -> None:
+ """Initialize an install request.
+
+ :param working_dir: A path to store temporary files used during installation
+ :param repo_name: The name of a repository to install from, e.g. DragonHPC/dragon
+ :param version: The version to install, e.g. v0.10
+ """
+
+ self.working_dir = working_dir
+ """A path to store temporary files used during installation"""
+
+ self.repo_name = repo_name or DEFAULT_DRAGON_REPO
+ """The name of a repository to install from, e.g. DragonHPC/dragon"""
+
+ self.pkg_version = version or DEFAULT_DRAGON_VERSION
+ """The version to install, e.g. 0.10"""
+
+ self._check()
+
+ def _check(self) -> None:
+ """Perform validation of this instance
+
+ :raises ValueError: if any value fails validation"""
+ if not self.repo_name or len(self.repo_name.split("/")) != 2:
+ raise ValueError(
+ f"Invalid dragon repository name. Example: `dragonhpc/dragon`"
+ )
+
+ # version must match standard dragon tag & filename format `vX.YZ`
+ match = re.match(r"^\d\.\d+$", self.pkg_version)
+ if not self.pkg_version or not match:
+ raise ValueError("Invalid dragon version. Examples: `0.9, 0.91, 0.10`")
+
+ # attempting to retrieve from a non-default repository requires an auth token
+ if self.repo_name.lower() != DEFAULT_DRAGON_REPO.lower() and not self.raw_token:
+ raise ValueError(
+ f"An access token must be available to access {self.repo_name}. "
+ f"Set the `{_GH_TOKEN}` env var to pass your access token."
+ )
+
+ @property
+ def raw_token(self) -> t.Optional[str]:
+ """Returns the raw access token from the environment, if available"""
+ return os.environ.get(_GH_TOKEN, None)
+
+
+def get_auth_token(request: DragonInstallRequest) -> t.Optional[Token]:
+ """Create a Github.Auth.Token if an access token can be found
+ in the environment
+
+ :param request: details of a request for the installation of the dragon package
+ :returns: an auth token if one can be built, otherwise `None`"""
+ if gh_token := request.raw_token:
+ return Token(gh_token)
+ return None
+
+
+def create_dotenv(dragon_root_dir: pathlib.Path, dragon_version: str) -> None:
"""Create a .env file with required environment variables for the Dragon runtime"""
dragon_root = str(dragon_root_dir)
- dragon_inc_dir = str(dragon_root_dir / "include")
- dragon_lib_dir = str(dragon_root_dir / "lib")
- dragon_bin_dir = str(dragon_root_dir / "bin")
+ dragon_inc_dir = dragon_root + "/include"
+ dragon_lib_dir = dragon_root + "/lib"
+ dragon_bin_dir = dragon_root + "/bin"
dragon_vars = {
"DRAGON_BASE_DIR": dragon_root,
- "DRAGON_ROOT_DIR": dragon_root, # note: same as base_dir
+ "DRAGON_ROOT_DIR": dragon_root,
"DRAGON_INCLUDE_DIR": dragon_inc_dir,
"DRAGON_LIB_DIR": dragon_lib_dir,
- "DRAGON_VERSION": dragon_pin(),
+ "DRAGON_VERSION": dragon_version,
"PATH": dragon_bin_dir,
"LD_LIBRARY_PATH": dragon_lib_dir,
}
@@ -48,12 +124,6 @@ def python_version() -> str:
return f"py{sys.version_info.major}.{sys.version_info.minor}"
-def dragon_pin() -> str:
- """Return a string indicating the pinned major/minor version of the dragon
- package to install"""
- return "0.9"
-
-
def _platform_filter(asset_name: str) -> bool:
"""Return True if the asset name matches naming standard for current
platform (Cray or non-Cray). Otherwise, returns False.
@@ -75,67 +145,125 @@ def _version_filter(asset_name: str) -> bool:
return python_version() in asset_name
-def _pin_filter(asset_name: str) -> bool:
+def _pin_filter(asset_name: str, dragon_version: str) -> bool:
"""Return true if the supplied value contains a dragon version pin match
- :param asset_name: A value to inspect for keywords indicating a dragon version
+ :param asset_name: the asset name to inspect for keywords indicating a dragon version
+ :param dragon_version: the dragon version to match
:returns: True if supplied value is correct for current dragon version"""
- return f"dragon-{dragon_pin()}" in asset_name
+ return f"dragon-{dragon_version}" in asset_name
+
+
+def _get_all_releases(dragon_repo: Repository) -> t.Collection[GitRelease]:
+ """Retrieve all available releases for the configured dragon repository
+
+ :param dragon_repo: A GitHub repository object for the dragon package
+ :returns: A list of GitRelease"""
+    all_releases = list(dragon_repo.get_releases())
+ return all_releases
-def _get_release_assets() -> t.Collection[GitReleaseAsset]:
+def _get_release_assets(request: DragonInstallRequest) -> t.Collection[GitReleaseAsset]:
"""Retrieve a collection of available assets for all releases that satisfy
the dragon version pin
+ :param request: details of a request for the installation of the dragon package
:returns: A collection of release assets"""
- git = Github()
-
- dragon_repo = git.get_repo("DragonHPC/dragon")
+ auth = get_auth_token(request)
+ git = Github(auth=auth)
+ dragon_repo = git.get_repo(request.repo_name)
if dragon_repo is None:
raise SmartSimCLIActionCancelled("Unable to locate dragon repo")
- # find any releases matching our pinned version requirement
- tags = [tag for tag in dragon_repo.get_tags() if dragon_pin() in tag.name]
- # repo.get_latest_release fails if only pre-release results are returned
- pin_releases = list(dragon_repo.get_release(tag.name) for tag in tags)
- releases = sorted(pin_releases, key=lambda r: r.published_at, reverse=True)
+ all_releases = sorted(
+ _get_all_releases(dragon_repo), key=lambda r: r.published_at, reverse=True
+ )
- # take the most recent release for the given pin
- assets = releases[0].assets
+ # filter the list of releases to include only the target version
+ releases = [
+ release
+ for release in all_releases
+        if request.pkg_version in release.title or request.pkg_version in release.tag_name
+ ]
+
+ releases = sorted(releases, key=lambda r: r.published_at, reverse=True)
+
+ if not releases:
+ release_titles = ", ".join(release.title for release in all_releases)
+ raise SmartSimCLIActionCancelled(
+ f"Unable to find a release for dragon version {request.pkg_version}. "
+ f"Available releases: {release_titles}"
+ )
+
+ assets: t.List[GitReleaseAsset] = []
+
+ # install the latest release of the target version (including pre-release)
+ for release in releases:
+ # delay in attaching release assets may leave us with an empty list, retry
+ # with the next available release
+ if assets := list(release.get_assets()):
+ logger.debug(f"Found assets for dragon release {release.title}")
+ break
+ else:
+ logger.debug(f"No assets for dragon release {release.title}. Retrying.")
+
+ if not assets:
+ raise SmartSimCLIActionCancelled(
+ f"Unable to find assets for dragon release {release.title}"
+ )
return assets
-def filter_assets(assets: t.Collection[GitReleaseAsset]) -> t.Optional[GitReleaseAsset]:
+def filter_assets(
+ request: DragonInstallRequest, assets: t.Collection[GitReleaseAsset]
+) -> t.Optional[GitReleaseAsset]:
"""Filter the available release assets so that HSTA agents are used
when run on a Cray EX platform
+ :param request: details of a request for the installation of the dragon package
:param assets: The collection of dragon release assets to filter
:returns: An asset meeting platform & version filtering requirements"""
# Expect cray & non-cray assets that require a filter, e.g.
# 'dragon-0.8-py3.9.4.1-bafaa887f.tar.gz',
# 'dragon-0.8-py3.9.4.1-CRAYEX-ac132fe95.tar.gz'
- asset = next(
- (
- asset
- for asset in assets
- if _version_filter(asset.name)
- and _platform_filter(asset.name)
- and _pin_filter(asset.name)
- ),
- None,
+ all_assets = [asset.name for asset in assets]
+
+ assets = list(
+ asset
+ for asset in assets
+ if _version_filter(asset.name) and _pin_filter(asset.name, request.pkg_version)
)
+
+ if len(assets) == 0:
+ available = "\n\t".join(all_assets)
+ logger.warning(
+ f"Please specify a dragon version (e.g. {DEFAULT_DRAGON_VERSION}) "
+ f"of an asset available in the repository:\n\t{available}"
+ )
+ return None
+
+ asset: t.Optional[GitReleaseAsset] = None
+
+ # Apply platform filter if we have multiple matches for python/dragon version
+ if len(assets) > 0:
+ asset = next((asset for asset in assets if _platform_filter(asset.name)), None)
+
+ if not asset:
+ asset = assets[0]
+ logger.warning(f"Platform-specific package not found. Using {asset.name}")
+
return asset
-def retrieve_asset_info() -> GitReleaseAsset:
+def retrieve_asset_info(request: DragonInstallRequest) -> GitReleaseAsset:
"""Find a release asset that meets all necessary filtering criteria
- :param dragon_pin: identify the dragon version to install (e.g. dragon-0.8)
+ :param request: details of a request for the installation of the dragon package
:returns: A GitHub release asset"""
- assets = _get_release_assets()
- asset = filter_assets(assets)
+ assets = _get_release_assets(request)
+ asset = filter_assets(request, assets)
platform_result = check_platform()
if not platform_result.is_cray:
@@ -150,43 +278,79 @@ def retrieve_asset_info() -> GitReleaseAsset:
return asset
-def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib.Path:
+def retrieve_asset(
+ request: DragonInstallRequest, asset: GitReleaseAsset
+) -> pathlib.Path:
"""Retrieve the physical file associated to a given GitHub release asset
- :param working_dir: location in file system where assets should be written
+ :param request: details of a request for the installation of the dragon package
:param asset: GitHub release asset to retrieve
- :returns: path to the downloaded asset"""
- if working_dir.exists() and list(working_dir.rglob("*.whl")):
- return working_dir
+ :returns: path to the directory containing the extracted release asset
+ :raises SmartSimCLIActionCancelled: if the asset cannot be downloaded or extracted
+ """
+ download_dir = request.working_dir / str(asset.id)
+
+    # remove any files left over from a previous download of this asset
+    # before downloading and extracting it again
+    cleanup(download_dir)
+ download_dir.mkdir(parents=True, exist_ok=True)
+
+ # grab a copy of the complete asset
+ asset_path = download_dir / str(asset.name)
+
+ # use the asset URL instead of the browser_download_url to enable
+ # using auth for private repositories
+ headers: t.Dict[str, str] = {"Accept": "application/octet-stream"}
+
+ if request.raw_token:
+ headers["Authorization"] = f"Bearer {request.raw_token}"
+
+ try:
+ # a github asset endpoint causes a redirect. the first request
+ # receives a pre-signed URL to the asset to pass on to retrieve
+ dl_request = Request(asset.url, headers=headers)
+ response = urlopen(dl_request)
+ presigned_url = response.url
+
+ logger.debug(f"Retrieved asset {asset.name} metadata from {asset.url}")
+ except Exception:
+ logger.exception(f"Unable to download {asset.name} from: {asset.url}")
+ presigned_url = asset.url
+
+ # extract the asset
+ try:
+ retrieve(presigned_url, asset_path)
- archive = WebTGZ(asset.browser_download_url)
- archive.extract(working_dir)
+ logger.debug(f"Extracted {asset.name} to {download_dir}")
+ except Exception as ex:
+ raise SmartSimCLIActionCancelled(
+ f"Unable to extract {asset.name} from {download_dir}"
+ ) from ex
- logger.debug(f"Retrieved {asset.browser_download_url} to {working_dir}")
- return working_dir
+ return download_dir
-def install_package(asset_dir: pathlib.Path) -> int:
+def install_package(request: DragonInstallRequest, asset_dir: pathlib.Path) -> int:
"""Install the package found in `asset_dir` into the current python environment
- :param asset_dir: path to a decompressed archive contents for a release asset"""
- wheels = asset_dir.rglob("*.whl")
- wheel_path = next(wheels, None)
- if not wheel_path:
- logger.error(f"No wheel found for package in {asset_dir}")
+ :param request: details of a request for the installation of the dragon package
+ :param asset_dir: path to a decompressed archive contents for a release asset
+ :returns: Integer return code, 0 for success, non-zero on failures"""
+ found_wheels = list(asset_dir.rglob("*.whl"))
+ if not found_wheels:
+ logger.error(f"No wheel(s) found for package in {asset_dir}")
return 1
- create_dotenv(wheel_path.parent)
-
- while wheel_path is not None:
- logger.info(f"Installing package: {wheel_path.absolute()}")
+ create_dotenv(found_wheels[0].parent, request.pkg_version)
- try:
- pip("install", "--force-reinstall", str(wheel_path), "numpy<2")
- wheel_path = next(wheels, None)
- except Exception:
- logger.error(f"Unable to install from {asset_dir}")
- return 1
+ try:
+ wheels = list(map(str, found_wheels))
+ for wheel_path in wheels:
+ logger.info(f"Installing package: {wheel_path}")
+ pip("install", wheel_path)
+ except Exception:
+ logger.error(f"Unable to install from {asset_dir}")
+ return 1
return 0
@@ -197,36 +361,83 @@ def cleanup(
"""Delete the downloaded asset and any files extracted during installation
:param archive_path: path to a downloaded archive for a release asset"""
- if archive_path:
- archive_path.unlink(missing_ok=True)
- logger.debug(f"Deleted archive: {archive_path}")
+ if not archive_path:
+ return
+
+ if archive_path.exists() and archive_path.is_file():
+ archive_path.unlink()
+ archive_path = archive_path.parent
+ if archive_path.exists() and archive_path.is_dir():
+ shutil.rmtree(archive_path, ignore_errors=True)
+ logger.debug(f"Deleted temporary files in: {archive_path}")
-def install_dragon(extraction_dir: t.Union[str, os.PathLike[str]]) -> int:
+
+def install_dragon(request: DragonInstallRequest) -> int:
"""Retrieve a dragon runtime appropriate for the current platform
and install to the current python environment
- :param extraction_dir: path for download and extraction of assets
+
+ :param request: details of a request for the installation of the dragon package
:returns: Integer return code, 0 for success, non-zero on failures"""
if sys.platform == "darwin":
logger.debug(f"Dragon not supported on platform: {sys.platform}")
return 1
- extraction_dir = pathlib.Path(extraction_dir)
- filename: t.Optional[pathlib.Path] = None
asset_dir: t.Optional[pathlib.Path] = None
try:
- asset_info = retrieve_asset_info()
- asset_dir = retrieve_asset(extraction_dir, asset_info)
+ asset_info = retrieve_asset_info(request)
+ if asset_info is not None:
+ asset_dir = retrieve_asset(request, asset_info)
+ return install_package(request, asset_dir)
- return install_package(asset_dir)
+ except SmartSimCLIActionCancelled as ex:
+ logger.warning(*ex.args)
except Exception as ex:
- logger.error("Unable to install dragon runtime", exc_info=ex)
- finally:
- cleanup(filename)
+ logger.error("Unable to install dragon runtime", exc_info=True)
return 2
+def display_post_install_logs() -> None:
+ """Display post-installation instructions for the user"""
+
+ examples = {
+ "ofi-include": "/opt/cray/include",
+ "ofi-build-lib": "/opt/cray/lib64",
+ "ofi-runtime-lib": "/opt/cray/lib64",
+ }
+
+ config = ":".join(f"{k}={v}" for k, v in examples.items())
+ example_msg1 = f"dragon-config -a \\"
+ example_msg2 = f' "{config}"'
+
+ logger.info(
+ "************************** Dragon Package Installed *****************************"
+ )
+ logger.info("To enable Dragon to use HSTA (default: TCP), configure the following:")
+
+ for key in examples:
+ logger.info(f"\t{key}")
+
+ logger.info("Example:")
+ logger.info(example_msg1)
+ logger.info(example_msg2)
+ logger.info(
+ "*********************************************************************************"
+ )
+
+
if __name__ == "__main__":
- sys.exit(install_dragon(CONFIG.core_path / ".dragon"))
+ # path for download and extraction of assets
+ extraction_dir = CONFIG.core_path / ".dragon"
+ dragon_repo = DEFAULT_DRAGON_REPO
+ dragon_version = DEFAULT_DRAGON_VERSION
+
+ request = DragonInstallRequest(
+ extraction_dir,
+ dragon_repo,
+ dragon_version,
+ )
+
+ sys.exit(install_dragon(request))
diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py
index 16b6ec4ea8..a87642e49f 100644
--- a/smartsim/_core/_cli/validate.py
+++ b/smartsim/_core/_cli/validate.py
@@ -33,7 +33,7 @@
from types import TracebackType
from smartsim._core._cli.utils import SMART_LOGGER_FORMAT
-from smartsim._core._install.builder import Device
+from smartsim._core._install.platform import Device
from smartsim.log import get_logger
logger = get_logger("Smart", fmt=SMART_LOGGER_FORMAT)
@@ -69,7 +69,9 @@ def __exit__(
self._finalizer.detach() # type: ignore[attr-defined]
-def execute(args: argparse.Namespace) -> int:
+def execute(
+ args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None
+) -> int:
"""Validate the SmartSim installation works as expected given a
simple experiment
"""
diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py
index ca52520695..552f9e28b0 100644
--- a/smartsim/_core/_install/buildenv.py
+++ b/smartsim/_core/_install/buildenv.py
@@ -53,30 +53,6 @@ class SetupError(Exception):
"""
-class VersionConflictError(SetupError):
- """An error for when version numbers of some library/package/program/etc
- do not match and build may not be able to continue
- """
-
- def __init__(
- self,
- name: str,
- current_version: "Version_",
- target_version: "Version_",
- msg: t.Optional[str] = None,
- ) -> None:
- if msg is None:
- msg = (
- f"Incompatible version for {name} detected: "
- f"{name} {target_version} requested but {name} {current_version} "
- "installed."
- )
- super().__init__(msg)
- self.name = name
- self.current_version = current_version
- self.target_version = target_version
-
-
# so as to not conflict with pkg_resources.packaging.version.Version
# pylint: disable-next=invalid-name
class Version_(str):
@@ -183,58 +159,29 @@ class Versioner:
PYTHON_MIN = Version_("3.9.0")
# Versions
- SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.7.0"))
+ SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.8.0"))
SMARTSIM_SUFFIX = get_env("SMARTSIM_SUFFIX", "")
- # ML/DL
- # torch can be set by the user because we download that for them
- TORCH = Version_(get_env("SMARTSIM_TORCH", "2.0.1"))
- TORCHVISION = Version_(get_env("SMARTSIM_TORCHVIS", "0.15.2"))
- TORCH_CPU_SUFFIX = Version_(get_env("TORCH_CPU_SUFFIX", "+cpu"))
- TORCH_CUDA_SUFFIX = Version_(get_env("TORCH_CUDA_SUFFIX", "+cu117"))
-
- # TensorFlow and ONNX only use the defaults
+ # Redis
+ REDIS = Version_(get_env("SMARTSIM_REDIS", "7.2.4"))
+ REDIS_URL = get_env("SMARTSIM_REDIS_URL", "https://github.com/redis/redis.git")
+ REDIS_BRANCH = get_env("SMARTSIM_REDIS_BRANCH", REDIS)
- TENSORFLOW = Version_("2.13.1")
- ONNX = Version_("1.14.1")
+ # RedisAI
+ REDISAI = "1.2.7"
+ REDISAI_URL = get_env(
+ "SMARTSIM_REDISAI_URL", "https://github.com/RedisAI/RedisAI.git"
+ )
+ REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}")
- def as_dict(self) -> t.Dict[str, t.Tuple[str, ...]]:
+ def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]:
pkg_map = {
"SMARTSIM": self.SMARTSIM,
- "TORCH": self.TORCH,
- "TENSORFLOW": self.TENSORFLOW,
- "ONNX": self.ONNX,
+ db_name: self.REDIS,
+ "REDISAI": self.REDISAI,
}
return {"Packages": tuple(pkg_map), "Versions": tuple(pkg_map.values())}
- # TODO add a backend for ml libraries
- def ml_extras_required(self) -> t.Dict[str, t.List[str]]:
- """Optional ML/DL dependencies we suggest for the user."""
- ml_defaults = {
- "torch": self.TORCH,
- "tensorflow": self.TENSORFLOW,
- "onnx": self.ONNX,
- "skl2onnx": "1.16.0",
- "onnxmltools": "1.12.0",
- "scikit-learn": "1.3.2",
- "torchvision": "0.15.2",
- "torch_cpu_suffix": "+cpu",
- "torch_cuda_suffix": "+cu117",
- }
-
- # remove torch-related fields as they are subject to change
- # by having the user change hardware (cpu/gpu)
- _torch_fields = [
- "torch",
- "torchvision",
- "torch_cpu_suffix",
- "torch_cuda_suffix",
- ]
- for field in _torch_fields:
- ml_defaults.pop(field)
-
- return {"ml": [f"{lib}=={vers}" for lib, vers in ml_defaults.items()]}
-
@staticmethod
def get_sha(setup_py_dir: Path) -> str:
"""Get the git sha of the current branch"""
@@ -304,7 +251,7 @@ def __init__(self, checks: bool = True) -> None:
self.check_dependencies()
def check_dependencies(self) -> None:
- deps = ["git", "git-lfs", "make", "wget", "cmake", self.CC, self.CXX]
+ deps = ["git", "make", "wget", "cmake", self.CC, self.CXX]
if int(self.CHECKS) == 0:
for dep in deps:
self.check_build_dependency(dep)
@@ -417,23 +364,6 @@ def check_build_dependency(command: str) -> None:
except OSError:
raise SetupError(f"{command} must be installed to build SmartSim") from None
- @classmethod
- def check_installed(
- cls, package: str, version: t.Optional[Version_] = None
- ) -> bool:
- """Check if a package is installed. If version is provided, check if
- it's a compatible version. (major and minor the same)
- """
- try:
- installed = cls.get_py_package_version(package)
- except importlib.metadata.PackageNotFoundError:
- return False
- if version:
- # detect if major or minor versions differ
- if installed.major != version.major or installed.minor != version.minor:
- raise VersionConflictError(package, installed, version)
- return True
-
@staticmethod
def get_py_package_version(package: str) -> Version_:
return Version_(importlib.metadata.version(package))
diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py
index 8cda07ede5..a1a4cb93b5 100644
--- a/smartsim/_core/_install/builder.py
+++ b/smartsim/_core/_install/builder.py
@@ -26,28 +26,15 @@
# pylint: disable=too-many-lines
-import enum
-import fileinput
-import itertools
import os
-import platform
import re
import shutil
import stat
import subprocess
-import tarfile
-import tempfile
import typing as t
-import urllib.request
-import zipfile
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
from pathlib import Path
-from shutil import which
from subprocess import SubprocessError
-# NOTE: This will be imported by setup.py and hence no smartsim related
-# items should be imported into this file.
# TODO: check cmake version and use system if possible to avoid conflicts
@@ -56,66 +43,10 @@
_U = t.TypeVar("_U")
-def expand_exe_path(exe: str) -> str:
- """Takes an executable and returns the full path to that executable
-
- :param exe: executable or file
- :raises TypeError: if file is not an executable
- :raises FileNotFoundError: if executable cannot be found
- """
-
- # which returns none if not found
- in_path = which(exe)
- if not in_path:
- if os.path.isfile(exe) and os.access(exe, os.X_OK):
- return os.path.abspath(exe)
- if os.path.isfile(exe) and not os.access(exe, os.X_OK):
- raise TypeError(f"File, {exe}, is not an executable")
- raise FileNotFoundError(f"Could not locate executable {exe}")
- return os.path.abspath(in_path)
-
-
class BuildError(Exception):
pass
-class Architecture(enum.Enum):
- X64 = ("x86_64", "amd64")
- ARM64 = ("arm64",)
-
- @classmethod
- def from_str(cls, string: str, /) -> "Architecture":
- string = string.lower()
- for type_ in cls:
- if string in type_.value:
- return type_
- raise BuildError(f"Unrecognized or unsupported architecture: {string}")
-
-
-class Device(enum.Enum):
- CPU = "cpu"
- GPU = "gpu"
-
-
-class OperatingSystem(enum.Enum):
- LINUX = ("linux", "linux2")
- DARWIN = ("darwin",)
-
- @classmethod
- def from_str(cls, string: str, /) -> "OperatingSystem":
- string = string.lower()
- for type_ in cls:
- if string in type_.value:
- return type_
- raise BuildError(f"Unrecognized or unsupported operating system: {string}")
-
-
-class Platform(t.NamedTuple):
- os: OperatingSystem
- architecture: Architecture
-
-
-# TODO: Add FeatureStoreBuilder member
class Builder:
"""Base class for building third-party libraries"""
@@ -133,13 +64,10 @@ def __init__(
self,
env: t.Dict[str, str],
jobs: int = 1,
- _os: OperatingSystem = OperatingSystem.from_str(platform.system()),
- architecture: Architecture = Architecture.from_str(platform.machine()),
verbose: bool = False,
) -> None:
# build environment from buildenv
self.env = env
- self._platform = Platform(_os, architecture)
# Find _core directory and set up paths
_core_dir = Path(os.path.abspath(__file__)).parent.parent
@@ -174,11 +102,6 @@ def out(self) -> t.Optional[int]:
def is_built(self) -> bool:
raise NotImplementedError
- def build_from_git(
- self, git_url: str, branch: str, device: Device = Device.CPU
- ) -> None:
- raise NotImplementedError
-
@staticmethod
def binary_path(binary: str) -> str:
binary_ = shutil.which(binary)
@@ -239,281 +162,3 @@ def run_command(
raise BuildError(error)
except (OSError, SubprocessError) as e:
raise BuildError(e) from e
-
-
-class _WebLocation(ABC):
- @property
- @abstractmethod
- def url(self) -> str: ...
-
-
-class _WebGitRepository(_WebLocation):
- def clone(
- self,
- target: _PathLike,
- depth: t.Optional[int] = None,
- branch: t.Optional[str] = None,
- ) -> None:
- depth_ = ("--depth", str(depth)) if depth is not None else ()
- branch_ = ("--branch", branch) if branch is not None else ()
- _git("clone", "-q", *depth_, *branch_, self.url, os.fspath(target))
-
-
-@t.final
-@dataclass(frozen=True)
-class _DLPackRepository(_WebGitRepository):
- version: str
-
- @staticmethod
- def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
- return (
- (OperatingSystem.LINUX, Architecture.X64),
- (OperatingSystem.DARWIN, Architecture.X64),
- (OperatingSystem.DARWIN, Architecture.ARM64),
- )
-
- @property
- def url(self) -> str:
- return ""
-
-
-class _WebArchive(_WebLocation):
- @property
- def name(self) -> str:
- _, name = self.url.rsplit("/", 1)
- return name
-
- def download(self, target: _PathLike) -> Path:
- target = Path(target)
- if target.is_dir():
- target = target / self.name
- file, _ = urllib.request.urlretrieve(self.url, target)
- return Path(file).resolve()
-
-
-class _ExtractableWebArchive(_WebArchive, ABC):
- @abstractmethod
- def _extract_download(self, download_path: Path, target: _PathLike) -> None: ...
-
- def extract(self, target: _PathLike) -> None:
- with tempfile.TemporaryDirectory() as tmp_dir:
- arch_path = self.download(tmp_dir)
- self._extract_download(arch_path, target)
-
-
-class _WebTGZ(_ExtractableWebArchive):
- def _extract_download(self, download_path: Path, target: _PathLike) -> None:
- with tarfile.open(download_path, "r") as tgz_file:
- tgz_file.extractall(target)
-
-
-class _WebZip(_ExtractableWebArchive):
- def _extract_download(self, download_path: Path, target: _PathLike) -> None:
- with zipfile.ZipFile(download_path, "r") as zip_file:
- zip_file.extractall(target)
-
-
-class WebTGZ(_WebTGZ):
- def __init__(self, url: str) -> None:
- self._url = url
-
- @property
- def url(self) -> str:
- return self._url
-
-
-@dataclass(frozen=True)
-class _PTArchive(_WebZip):
- architecture: Architecture
- device: Device
- version: str
- with_mkl: bool
-
- @staticmethod
- def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
- # TODO: This will need to be revisited if the inheritance tree gets deeper
- return tuple(
- itertools.chain.from_iterable(
- var.supported_platforms() for var in _PTArchive.__subclasses__()
- )
- )
-
- @staticmethod
- def _patch_out_mkl(libtorch_root: Path) -> None:
- _modify_source_files(
- libtorch_root / "share/cmake/Caffe2/public/mkl.cmake",
- r"find_package\(MKL QUIET\)",
- "# find_package(MKL QUIET)",
- )
-
- def extract(self, target: _PathLike) -> None:
- super().extract(target)
- if not self.with_mkl:
- self._patch_out_mkl(Path(target))
-
-
-@t.final
-class _PTArchiveLinux(_PTArchive):
- @staticmethod
- def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
- return ((OperatingSystem.LINUX, Architecture.X64),)
-
- @property
- def url(self) -> str:
- if self.device == Device.GPU:
- pt_build = "cu117"
- else:
- pt_build = Device.CPU.value
- # pylint: disable-next=line-too-long
- libtorch_archive = (
- f"libtorch-cxx11-abi-shared-without-deps-{self.version}%2B{pt_build}.zip"
- )
- return f"https://download.pytorch.org/libtorch/{pt_build}/{libtorch_archive}"
-
-
-@t.final
-class _PTArchiveMacOSX(_PTArchive):
- @staticmethod
- def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
- return (
- (OperatingSystem.DARWIN, Architecture.ARM64),
- (OperatingSystem.DARWIN, Architecture.X64),
- )
-
- @property
- def url(self) -> str:
- if self.architecture == Architecture.X64:
- pt_build = Device.CPU.value
- libtorch_archive = f"libtorch-macos-{self.version}.zip"
- root_url = "https://download.pytorch.org/libtorch"
- return f"{root_url}/{pt_build}/{libtorch_archive}"
- if self.architecture == Architecture.ARM64:
- libtorch_archive = f"libtorch-macos-arm64-{self.version}.zip"
- # pylint: disable-next=line-too-long
- root_url = (
- "https://github.com/CrayLabs/ml_lib_builder/releases/download/v0.1/"
- )
- return f"{root_url}/{libtorch_archive}"
-
- raise BuildError(f"Unsupported architecture for Pytorch: {self.architecture}")
-
-
-def _choose_pt_variant(
- os_: OperatingSystem,
-) -> t.Union[t.Type[_PTArchiveLinux], t.Type[_PTArchiveMacOSX]]:
- if os_ == OperatingSystem.DARWIN:
- return _PTArchiveMacOSX
- if os_ == OperatingSystem.LINUX:
- return _PTArchiveLinux
-
- raise BuildError(f"Unsupported OS for PyTorch: {os_}")
-
-
-@t.final
-@dataclass(frozen=True)
-class _TFArchive(_WebTGZ):
- os_: OperatingSystem
- architecture: Architecture
- device: Device
- version: str
-
- @staticmethod
- def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
- return (
- (OperatingSystem.LINUX, Architecture.X64),
- (OperatingSystem.DARWIN, Architecture.X64),
- )
-
- @property
- def url(self) -> str:
- if self.architecture == Architecture.X64:
- tf_arch = "x86_64"
- else:
- raise BuildError(
- f"Unexpected Architecture for TF Archive: {self.architecture}"
- )
-
- if self.os_ == OperatingSystem.LINUX:
- tf_os = "linux"
- tf_device = self.device
- elif self.os_ == OperatingSystem.DARWIN:
- tf_os = "darwin"
- tf_device = Device.CPU
- else:
- raise BuildError(f"Unexpected OS for TF Archive: {self.os_}")
- return (
- "https://storage.googleapis.com/tensorflow/libtensorflow/"
- f"libtensorflow-{tf_device.value}-{tf_os}-{tf_arch}-{self.version}.tar.gz"
- )
-
-
-@t.final
-@dataclass(frozen=True)
-class _ORTArchive(_WebTGZ):
- os_: OperatingSystem
- device: Device
- version: str
-
- @staticmethod
- def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
- return (
- (OperatingSystem.LINUX, Architecture.X64),
- (OperatingSystem.DARWIN, Architecture.X64),
- )
-
- @property
- def url(self) -> str:
- ort_url_base = (
- "https://github.com/microsoft/onnxruntime/releases/"
- f"download/v{self.version}"
- )
- if self.os_ == OperatingSystem.LINUX:
- ort_os = "linux"
- ort_arch = "x64"
- ort_build = "-gpu" if self.device == Device.GPU else ""
- elif self.os_ == OperatingSystem.DARWIN:
- ort_os = "osx"
- ort_arch = "x86_64"
- ort_build = ""
- else:
- raise BuildError(f"Unexpected OS for TF Archive: {self.os_}")
- ort_archive = f"onnxruntime-{ort_os}-{ort_arch}{ort_build}-{self.version}.tgz"
- return f"{ort_url_base}/{ort_archive}"
-
-
-def _git(*args: str) -> None:
- git = Builder.binary_path("git")
- cmd = (git,) + args
- with subprocess.Popen(cmd) as proc:
- proc.wait()
- if proc.returncode != 0:
- raise BuildError(
- f"Command `{' '.join(cmd)}` failed with exit code {proc.returncode}"
- )
-
-
-def config_git_command(plat: Platform, cmd: t.Sequence[str]) -> t.List[str]:
- """Modify git commands to include autocrlf when on a platform that needs
- autocrlf enabled to behave correctly
- """
- cmd = list(cmd)
- where = next((i for i, tok in enumerate(cmd) if tok.endswith("git")), len(cmd)) + 2
- if where >= len(cmd):
- raise ValueError(f"Failed to locate git command in '{' '.join(cmd)}'")
- if plat == Platform(OperatingSystem.DARWIN, Architecture.ARM64):
- cmd = (
- cmd[:where]
- + ["--config", "core.autocrlf=false", "--config", "core.eol=lf"]
- + cmd[where:]
- )
- return cmd
-
-
-def _modify_source_files(
- files: t.Union[_PathLike, t.Iterable[_PathLike]], regex: str, replacement: str
-) -> None:
- compiled_regex = re.compile(regex)
- with fileinput.input(files=files, inplace=True) as handles:
- for line in handles:
- line = compiled_regex.sub(replacement, line)
- print(line, end="")
diff --git a/smartsim/_core/_install/configs/mlpackages/DarwinARM64CPU.json b/smartsim/_core/_install/configs/mlpackages/DarwinARM64CPU.json
new file mode 100644
index 0000000000..5109cf376c
--- /dev/null
+++ b/smartsim/_core/_install/configs/mlpackages/DarwinARM64CPU.json
@@ -0,0 +1,59 @@
+{
+ "platform": {
+ "operating_system":"darwin",
+ "architecture":"arm64",
+ "device":"cpu"
+ },
+ "ml_packages": [
+ {
+ "name": "dlpack",
+ "version": "v0.5_RAI",
+ "pip_index": "",
+ "python_packages": [],
+ "lib_source": "https://github.com/RedisAI/dlpack.git"
+ },
+ {
+ "name": "libtorch",
+ "version": "2.4.0",
+ "pip_index": "",
+ "python_packages": [
+ "torch==2.4.0",
+ "torchvision==0.19.0",
+ "torchaudio==2.4.0"
+ ],
+ "lib_source": "https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.4.0.zip",
+ "rai_patches": [
+ {
+ "description": "Patch RedisAI module to require C++17 standard instead of C++14",
+ "source_file": "src/backends/libtorch_c/CMakeLists.txt",
+ "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)",
+ "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input inputs",
+ "replacement": "TF_Output inputs"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input port",
+ "replacement": "TF_Output port"
+ }
+ ]
+ },
+ {
+ "name": "onnxruntime",
+ "version": "1.17.3",
+ "pip_index": "",
+ "python_packages": [
+ "onnx==1.15",
+ "skl2onnx",
+ "scikit-learn",
+ "onnxmltools"
+ ],
+ "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-arm64-1.17.3.tgz"
+ }
+ ]
+}
diff --git a/smartsim/_core/_install/configs/mlpackages/DarwinX64CPU.json b/smartsim/_core/_install/configs/mlpackages/DarwinX64CPU.json
new file mode 100644
index 0000000000..06e30cbf8b
--- /dev/null
+++ b/smartsim/_core/_install/configs/mlpackages/DarwinX64CPU.json
@@ -0,0 +1,68 @@
+{
+ "platform": {
+ "operating_system":"darwin",
+ "architecture":"x86_64",
+ "device":"cpu"
+ },
+ "ml_packages": [
+ {
+ "name": "dlpack",
+ "version": "v0.5_RAI",
+ "pip_index": "",
+ "python_packages": [],
+ "lib_source": "https://github.com/RedisAI/dlpack.git"
+ },
+ {
+ "name": "libtorch",
+ "version": "2.2.2",
+ "pip_index": "",
+ "python_packages": [
+ "torch==2.2.2",
+ "torchvision==0.17.2",
+ "torchaudio==2.2.2"
+ ],
+ "lib_source": "https://download.pytorch.org/libtorch/cpu/libtorch-macos-x86_64-2.2.2.zip",
+ "rai_patches": [
+ {
+ "description": "Patch RedisAI module to require C++17 standard instead of C++14",
+ "source_file": "src/backends/libtorch_c/CMakeLists.txt",
+ "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)",
+ "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input inputs",
+ "replacement": "TF_Output inputs"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input port",
+ "replacement": "TF_Output port"
+ }
+ ]
+ },
+ {
+ "name": "libtensorflow",
+ "version": "2.15",
+ "pip_index": "",
+ "python_packages": [
+ "tensorflow==2.15"
+ ],
+ "lib_source": "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-darwin-x86_64-2.15.0.tar.gz"
+ },
+ {
+ "name": "onnxruntime",
+ "version": "1.17.3",
+ "pip_index": "",
+ "python_packages": [
+ "onnx==1.15",
+ "skl2onnx",
+ "scikit-learn",
+ "onnxmltools"
+ ],
+ "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-x86_64-1.17.3.tgz"
+ }
+ ]
+}
diff --git a/smartsim/_core/_install/configs/mlpackages/LinuxX64CPU.json b/smartsim/_core/_install/configs/mlpackages/LinuxX64CPU.json
new file mode 100644
index 0000000000..2b1224df46
--- /dev/null
+++ b/smartsim/_core/_install/configs/mlpackages/LinuxX64CPU.json
@@ -0,0 +1,68 @@
+{
+ "platform": {
+ "operating_system":"linux",
+ "architecture":"x86_64",
+ "device":"cpu"
+ },
+ "ml_packages": [
+ {
+ "name": "dlpack",
+ "version": "v0.5_RAI",
+ "pip_index": "",
+ "python_packages": [],
+ "lib_source": "https://github.com/RedisAI/dlpack.git"
+ },
+ {
+ "name": "libtorch",
+ "version": "2.4.0",
+ "pip_index": "https://download.pytorch.org/whl/cpu",
+ "python_packages": [
+ "torch==2.4.0+cpu",
+ "torchvision==0.19.0+cpu",
+ "torchaudio==2.4.0+cpu"
+ ],
+ "lib_source": "https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.4.0%2Bcpu.zip",
+ "rai_patches": [
+ {
+ "description": "Patch RedisAI module to require C++17 standard instead of C++14",
+ "source_file": "src/backends/libtorch_c/CMakeLists.txt",
+ "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)",
+ "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input inputs",
+ "replacement": "TF_Output inputs"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input port",
+ "replacement": "TF_Output port"
+ }
+ ]
+ },
+ {
+ "name": "libtensorflow",
+ "version": "2.15",
+ "pip_index": "",
+ "python_packages": [
+ "tensorflow==2.15"
+ ],
+ "lib_source": "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-linux-x86_64-2.15.0.tar.gz"
+ },
+ {
+ "name": "onnxruntime",
+ "version": "1.17.3",
+ "pip_index": "",
+ "python_packages": [
+ "onnx<=1.15",
+ "skl2onnx",
+ "scikit-learn",
+ "onnxmltools"
+ ],
+ "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz"
+ }
+ ]
+}
diff --git a/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA11.json b/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA11.json
new file mode 100644
index 0000000000..30d9cbf516
--- /dev/null
+++ b/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA11.json
@@ -0,0 +1,68 @@
+{
+ "platform": {
+ "operating_system":"linux",
+ "architecture":"x86_64",
+ "device":"cuda-11"
+ },
+ "ml_packages": [
+ {
+ "name": "dlpack",
+ "version": "v0.5_RAI",
+ "pip_index": "",
+ "python_packages": [],
+ "lib_source": "https://github.com/RedisAI/dlpack.git"
+ },
+ {
+ "name": "libtorch",
+ "version": "2.3.1",
+ "pip_index": "https://download.pytorch.org/whl/cu118",
+ "python_packages": [
+ "torch==2.3.1+cu118",
+ "torchvision==0.18.1+cu118",
+ "torchaudio==2.3.1+cu118"
+ ],
+ "lib_source": "https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcu118.zip",
+ "rai_patches": [
+ {
+ "description": "Patch RedisAI module to require C++17 standard instead of C++14",
+ "source_file": "src/backends/libtorch_c/CMakeLists.txt",
+ "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)",
+ "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input inputs",
+ "replacement": "TF_Output inputs"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input port",
+ "replacement": "TF_Output port"
+ }
+ ]
+ },
+ {
+ "name": "libtensorflow",
+ "version": "2.14.1",
+ "pip_index": "",
+ "python_packages": [
+ "tensorflow==2.14.1"
+ ],
+ "lib_source": "https://github.com/CrayLabs/ml_lib_builder/releases/download/v0.2/libtensorflow-2.14.1-linux-x64-cuda-11.8.0.tgz"
+ },
+ {
+ "name": "onnxruntime",
+ "version": "1.17.3",
+ "pip_index": "",
+ "python_packages": [
+ "onnx==1.15",
+ "skl2onnx",
+ "scikit-learn",
+ "onnxmltools"
+ ],
+ "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-1.17.3.tgz"
+ }
+ ]
+}
diff --git a/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA12.json b/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA12.json
new file mode 100644
index 0000000000..a8bf330b4f
--- /dev/null
+++ b/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA12.json
@@ -0,0 +1,76 @@
+{
+ "platform": {
+ "operating_system":"linux",
+ "architecture":"x86_64",
+ "device":"cuda-12"
+ },
+ "ml_packages": [
+ {
+ "name": "dlpack",
+ "version": "v0.5_RAI",
+ "pip_index": "",
+ "python_packages": [],
+ "lib_source": "https://github.com/RedisAI/dlpack.git"
+ },
+ {
+ "name": "libtorch",
+ "version": "2.3.1",
+ "pip_index": "https://download.pytorch.org/whl/cu121",
+ "python_packages": [
+ "torch==2.3.1+cu121",
+ "torchvision==0.18.1+cu121",
+ "torchaudio==2.3.1+cu121"
+ ],
+ "lib_source": "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcu121.zip",
+ "rai_patches": [
+ {
+ "description": "Patch RedisAI module to require C++17 standard instead of C++14",
+ "source_file": "src/backends/libtorch_c/CMakeLists.txt",
+ "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)",
+ "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input inputs",
+ "replacement": "TF_Output inputs"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input port",
+ "replacement": "TF_Output port"
+ }
+ ]
+ },
+ {
+ "name": "libtensorflow",
+ "version": "2.15",
+ "pip_index": "",
+ "python_packages": [
+ "tensorflow==2.15"
+ ],
+ "lib_source": "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-2.15.0.tar.gz",
+ "rai_patches": [
+ {
+ "description": "Patch RedisAI to point to correct tsl directory",
+ "source_file": "CMakeLists.txt",
+ "regex": "INCLUDE_DIRECTORIES\\(\\$\\{depsAbs\\}/libtensorflow/include\\)",
+ "replacement": "INCLUDE_DIRECTORIES(${depsAbs}/libtensorflow/include ${depsAbs}/libtensorflow/include/external/local_tsl)"
+ }
+ ]
+ },
+ {
+ "name": "onnxruntime",
+ "version": "1.17.3",
+ "pip_index": "",
+ "python_packages": [
+ "onnx==1.15",
+ "skl2onnx",
+ "scikit-learn",
+ "onnxmltools"
+ ],
+ "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-cuda12-1.17.3.tgz"
+ }
+ ]
+}
diff --git a/smartsim/_core/_install/configs/mlpackages/LinuxX64ROCM6.json b/smartsim/_core/_install/configs/mlpackages/LinuxX64ROCM6.json
new file mode 100644
index 0000000000..ba3c9a0bfb
--- /dev/null
+++ b/smartsim/_core/_install/configs/mlpackages/LinuxX64ROCM6.json
@@ -0,0 +1,59 @@
+{
+ "platform": {
+ "operating_system":"linux",
+ "architecture":"x86_64",
+ "device":"rocm-6"
+ },
+ "ml_packages": [
+ {
+ "name": "dlpack",
+ "version": "v0.5_RAI",
+ "pip_index": "",
+ "python_packages": [],
+ "lib_source": "https://github.com/RedisAI/dlpack.git"
+ },
+ {
+ "name": "libtorch",
+ "version": "2.4.0",
+ "pip_index": "https://download.pytorch.org/whl/rocm6.1",
+ "python_packages": [
+ "torch==2.4.0+rocm6.1",
+ "torchvision==0.19.0+rocm6.1",
+ "torchaudio==2.4.0+rocm6.1"
+ ],
+ "lib_source": "https://download.pytorch.org/libtorch/rocm6.1/libtorch-cxx11-abi-shared-with-deps-2.4.1%2Brocm6.1.zip",
+ "rai_patches": [
+ {
+ "description": "Patch RedisAI module to require C++17 standard instead of C++14",
+ "source_file": "src/backends/libtorch_c/CMakeLists.txt",
+ "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)",
+ "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)"
+ },
+ {
+ "description": "Fix Regex, Load HIP",
+ "source_file": "../package/libtorch/share/cmake/Caffe2/public/LoadHIP.cmake",
+ "regex": ".*string.*",
+ "replacement": ""
+ },
+ {
+ "description": "Replace `/opt/rocm` with `$ENV{ROCM_PATH}`",
+ "source_file": "../package/libtorch/share/cmake/Caffe2/Caffe2Targets.cmake",
+ "regex": "/opt/rocm",
+ "replacement": "$ENV{ROCM_PATH}"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input inputs",
+ "replacement": "TF_Output inputs"
+ },
+ {
+ "description": "Fix the type in a Tensorflow function signature",
+ "source_file": "src/backends/tensorflow.c",
+ "regex": "TF_Input port",
+ "replacement": "TF_Output port"
+ }
+ ]
+ }
+ ]
+}
diff --git a/smartsim/_core/_install/mlpackages.py b/smartsim/_core/_install/mlpackages.py
new file mode 100644
index 0000000000..04e3798d35
--- /dev/null
+++ b/smartsim/_core/_install/mlpackages.py
@@ -0,0 +1,198 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+import os
+import pathlib
+import re
+import subprocess
+import sys
+import typing as t
+from collections.abc import MutableMapping
+from dataclasses import dataclass
+
+from tabulate import tabulate
+
+from .platform import Platform
+from .types import PathLike
+from .utils import retrieve
+
+
+class RequireRelativePath(Exception):
+ pass
+
+
+@dataclass
+class RAIPatch:
+ """Holds information about how to patch a RedisAI source file
+
+ :param description: Human-readable description of the patch's purpose
+ :param replacement: The replacement for the line found by the regex
+ :param source_file: A relative path to the chosen file
+ :param regex: A regex pattern to match in the given file
+
+ """
+
+ description: str
+ replacement: str
+ source_file: pathlib.Path
+ regex: re.Pattern[str]
+
+ def __post_init__(self) -> None:
+ self.source_file = pathlib.Path(self.source_file)
+ self.regex = re.compile(self.regex)
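+
+ # A hedged illustration (values copied from the bundled platform JSON files):
+ # the dictionaries under "rai_patches" map directly onto this dataclass via
+ # keyword arguments, and __post_init__ compiles the regex, e.g.
+ #
+ #   patch = RAIPatch(
+ #       description="Fix the type in a Tensorflow function signature",
+ #       replacement="TF_Output inputs",
+ #       source_file="src/backends/tensorflow.c",
+ #       regex="TF_Input inputs",
+ #   )
+ #   patch.regex.sub(patch.replacement, "TF_Input inputs")  # -> "TF_Output inputs"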
+
+
+@dataclass
+class MLPackage:
+ """Describes the python and C/C++ library for an ML package"""
+
+ name: str
+ version: str
+ pip_index: str
+ python_packages: t.List[str]
+ lib_source: PathLike
+ rai_patches: t.Tuple[RAIPatch, ...] = ()
+
+ def retrieve(self, destination: PathLike) -> None:
+ """Retrieve an archive and/or repository for the package
+
+ :param destination: Path to place the extracted package or repository
+ """
+ retrieve(self.lib_source, pathlib.Path(destination))
+
+ def pip_install(self, quiet: bool = False) -> None:
+ """Install associated python packages
+
+ :param quiet: If True, suppress most of the pip output, defaults to False
+ """
+ if self.python_packages:
+ install_command = [sys.executable, "-m", "pip", "install"]
+ if self.pip_index:
+ install_command += ["--index-url", self.pip_index]
+ if quiet:
+ install_command += ["--quiet", "--no-warn-conflicts"]
+ install_command += self.python_packages
+ subprocess.check_call(install_command)
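+
+ # Illustrative sketch only (package pins taken from the bundled
+ # LinuxX64CPU.json config): for that platform the assembled command is roughly
+ #   python -m pip install --index-url https://download.pytorch.org/whl/cpu \
+ #       torch==2.4.0+cpu torchvision==0.19.0+cpu torchaudio==2.4.0+cpu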
+
+
+class MLPackageCollection(MutableMapping[str, MLPackage]):
+ """Collects multiple MLPackages
+
+ Defines a collection of MLPackages available for a specific platform
+ """
+
+ def __init__(self, platform: Platform, ml_packages: t.Sequence[MLPackage]):
+ self.platform = platform
+ self._ml_packages = {pkg.name: pkg for pkg in ml_packages}
+
+ @classmethod
+ def from_json_file(cls, json_file: PathLike) -> "MLPackageCollection":
+ """Create an MLPackageCollection specified from a JSON file
+
+ :param json_file: path to the JSON file
+ :return: An instance of MLPackageCollection for a platform
+ """
+ with open(json_file, "r", encoding="utf-8") as file_handle:
+ config_json = json.load(file_handle)
+ platform = Platform.from_strs(**config_json["platform"])
+
+ for ml_package in config_json["ml_packages"]:
+ # Convert the dictionary representation to a RAIPatch
+ if "rai_patches" in ml_package:
+ patch_list = ml_package.pop("rai_patches")
+ ml_package["rai_patches"] = [RAIPatch(**patch) for patch in patch_list]
+
+ ml_packages = [
+ MLPackage(**ml_package) for ml_package in config_json["ml_packages"]
+ ]
+ return cls(platform, ml_packages)
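+
+ # A usage sketch (the file name references one of the configs shipped in
+ # this change; any platform JSON with the same schema works):
+ #   packages = MLPackageCollection.from_json_file(
+ #       DEFAULT_MLPACKAGE_PATH / "LinuxX64CPU.json"
+ #   )
+ #   packages["libtorch"].version  # -> "2.4.0"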
+
+ def __iter__(self) -> t.Iterator[str]:
+ """Iterate over the mlpackages in the collection
+
+ :return: Iterator over mlpackages
+ """
+ return iter(self._ml_packages)
+
+ def __getitem__(self, key: str) -> MLPackage:
+ """Retrieve an MLPackage based on its name
+
+ :param key: Name of the python package (e.g. libtorch)
+ :return: MLPackage with all requirements
+ """
+ return self._ml_packages[key]
+
+ def __len__(self) -> int:
+ return len(self._ml_packages)
+
+ def __delitem__(self, key: str) -> None:
+ del self._ml_packages[key]
+
+ def __setitem__(self, key: t.Any, value: t.Any) -> t.NoReturn:
+ raise TypeError(f"{type(self).__name__} does not support item assignment")
+
+ def __contains__(self, key: object) -> bool:
+ return key in self._ml_packages
+
+ def __str__(self, tablefmt: str = "github") -> str:
+ """Display package names and versions as a table
+
+ :param tablefmt: Tabulate format, defaults to "github"
+ """
+
+ return tabulate(
+ [[k, v.version] for k, v in self._ml_packages.items()],
+ headers=["Package", "Version"],
+ tablefmt=tablefmt,
+ )
+
+
+def load_platform_configs(
+ config_file_path: pathlib.Path,
+) -> t.Dict[Platform, MLPackageCollection]:
+ """Create MLPackageCollections from JSON files in directory
+
+ :param config_file_path: Directory with JSON files describing the
+ configuration by platform
+ :return: Dictionary whose keys are the supported platforms and whose
+ values are their associated MLPackageCollections
+ """
+ if not config_file_path.is_dir():
+ path = os.fspath(config_file_path)
+ msg = f"Platform configuration directory `{path}` does not exist"
+ raise FileNotFoundError(msg)
+ configs = {}
+ for config_file in config_file_path.glob("*.json"):
+ dependencies = MLPackageCollection.from_json_file(config_file)
+ configs[dependencies.platform] = dependencies
+ return configs
+
+
+DEFAULT_MLPACKAGE_PATH: t.Final = (
+ pathlib.Path(__file__).parent / "configs" / "mlpackages"
+)
+DEFAULT_MLPACKAGES: t.Final = load_platform_configs(DEFAULT_MLPACKAGE_PATH)
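+
+# A hedged example of how these defaults are expected to be consumed; the
+# platform values mirror the "platform" block of the bundled JSON configs:
+#
+#   plat = Platform.from_strs("linux", "x86_64", "cpu")
+#   collection = DEFAULT_MLPACKAGES[plat]
+#   print(collection)  # package/version table rendered by tabulate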
diff --git a/smartsim/_core/_install/platform.py b/smartsim/_core/_install/platform.py
new file mode 100644
index 0000000000..bef13c6a0a
--- /dev/null
+++ b/smartsim/_core/_install/platform.py
@@ -0,0 +1,226 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import enum
+import json
+import os
+import pathlib
+import platform
+import typing as t
+from dataclasses import dataclass
+
+from typing_extensions import Self
+
+
+class PlatformError(Exception):
+ pass
+
+
+class UnsupportedError(PlatformError):
+ pass
+
+
+class Architecture(enum.Enum):
+ """Identifiers for supported CPU architectures
+
+ :return: An enum representing the CPU architecture
+ """
+
+ X64 = "x86_64"
+ ARM64 = "arm64"
+
+ @classmethod
+ def from_str(cls, string: str) -> "Architecture":
+ """Return enum associated with the architecture
+
+ :param string: String representing the architecture, see platform.machine
+ :return: Enum for a specific architecture
+ """
+ string = string.lower()
+ return cls(string)
+
+ @classmethod
+ def autodetect(cls) -> "Architecture":
+ """Automatically return the architecture of the current machine
+
+ :return: enum of this platform's architecture
+ """
+ return cls.from_str(platform.machine())
+
+
+class Device(enum.Enum):
+ """Identifiers for the device stack
+
+ :return: Enum associated with the device stack
+ """
+
+ CPU = "cpu"
+ CUDA11 = "cuda-11"
+ CUDA12 = "cuda-12"
+ ROCM5 = "rocm-5"
+ ROCM6 = "rocm-6"
+
+ @classmethod
+ def from_str(cls, str_: str) -> "Device":
+ """Return enum associated with the device
+
+ :param str_: String representing the device and version
+ :return: Enum for a specific device
+ """
+ str_ = str_.lower()
+ if str_ == "gpu":
+ # TODO: auto detect which device to use
+ # currently hard coded to `cuda11`
+ return cls.CUDA11
+ return cls(str_)
+
+ @classmethod
+ def detect_cuda_version(cls) -> t.Optional["Device"]:
+ """Find the enum based on environment CUDA
+
+ :return: Enum for the version of CUDA currently available
+ """
+ if cuda_home := os.environ.get("CUDA_HOME"):
+ cuda_path = pathlib.Path(cuda_home)
+ with open(cuda_path / "version.json", "r", encoding="utf-8") as file_handle:
+ cuda_versions = json.load(file_handle)
+ major = cuda_versions["cuda"]["version"].split(".")[0]
+ return cls.from_str(f"cuda-{major}")
+ return None
+
+ @classmethod
+ def detect_rocm_version(cls) -> t.Optional["Device"]:
+ """Find the enum based on environment ROCm
+
+ :return: Enum for the version of ROCm currently available
+ """
+ if rocm_home := os.environ.get("ROCM_HOME"):
+ rocm_path = pathlib.Path(rocm_home)
+ fname = rocm_path / ".info" / "version"
+ with open(fname, "r", encoding="utf-8") as file_handle:
+ major = file_handle.readline().split("-")[0].split(".")[0]
+ return cls.from_str(f"rocm-{major}")
+ return None
+
+ def is_gpu(self) -> bool:
+ """Whether the enum is categorized as a GPU
+
+ :return: True if GPU
+ """
+ return self != type(self).CPU
+
+ def is_cuda(self) -> bool:
+ """Whether the enum is associated with a CUDA device
+
+ :return: True for any supported CUDA enums
+ """
+ cls = type(self)
+ return self in cls.cuda_enums()
+
+ def is_rocm(self) -> bool:
+ """Whether the enum is associated with a ROCm device
+
+ :return: True for any supported ROCm enums
+ """
+ cls = type(self)
+ return self in cls.rocm_enums()
+
+ @classmethod
+ def cuda_enums(cls) -> t.Tuple["Device", ...]:
+ """Detect all CUDA devices supported by SmartSim
+
+ :return: all enums associated with CUDA
+ """
+ return tuple(device for device in cls if "cuda" in device.value)
+
+ @classmethod
+ def rocm_enums(cls) -> t.Tuple["Device", ...]:
+ """Detect all ROCm devices supported by SmartSim
+
+ :return: all enums associated with ROCm
+ """
+ return tuple(device for device in cls if "rocm" in device.value)
+
+
+class OperatingSystem(enum.Enum):
+ """Enum for all supported operating systems"""
+
+ LINUX = "linux"
+ DARWIN = "darwin"
+
+ @classmethod
+ def from_str(cls, string: str, /) -> "OperatingSystem":
+ """Return enum associated with the OS
+
+ :param string: String representing the OS
+ :return: Enum for a specific OS
+ """
+ string = string.lower()
+ return cls(string)
+
+ @classmethod
+ def autodetect(cls) -> "OperatingSystem":
+ """Automatically return the OS of the current machine
+
+ :return: enum of this platform's OS
+ """
+ return cls.from_str(platform.system())
+
+
+@dataclass(frozen=True)
+class Platform:
+ """Container describing relevant identifiers for a platform"""
+
+ operating_system: OperatingSystem
+ architecture: Architecture
+ device: Device
+
+ @classmethod
+ def from_strs(cls, operating_system: str, architecture: str, device: str) -> Self:
+ """Factory method for Platform from string onput
+
+ :param os: String identifier for the OS
+ :param architecture: String identifier for the architecture
+ :param device: String identifer for the device and version
+ :return: Instance of Platform
+ """
+ return cls(
+ OperatingSystem.from_str(operating_system),
+ Architecture.from_str(architecture),
+ Device.from_str(device),
+ )
+
+ def __str__(self) -> str:
+ """Human-readable representation of Platform
+
+ :return: String created from the values of the enums for each property
+ """
+ output = [
+ self.operating_system.name,
+ self.architecture.name,
+ self.device.name,
+ ]
+ return "-".join(output)
diff --git a/smartsim/_core/_install/redisaiBuilder.py b/smartsim/_core/_install/redisaiBuilder.py
new file mode 100644
index 0000000000..1dce6ddb45
--- /dev/null
+++ b/smartsim/_core/_install/redisaiBuilder.py
@@ -0,0 +1,301 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import fileinput
+import os
+import pathlib
+import shutil
+import stat
+import subprocess
+import typing as t
+from collections import deque
+
+from smartsim._core._cli.utils import SMART_LOGGER_FORMAT
+from smartsim._core._install.buildenv import BuildEnv
+from smartsim._core._install.mlpackages import MLPackageCollection, RAIPatch
+from smartsim._core._install.platform import OperatingSystem, Platform
+from smartsim._core._install.utils import retrieve
+from smartsim._core.config import CONFIG
+from smartsim.log import get_logger
+
+logger = get_logger("Smart", fmt=SMART_LOGGER_FORMAT)
+_SUPPORTED_ROCM_ARCH = "gfx90a"
+
+
+class RedisAIBuildError(Exception):
+ pass
+
+
+class RedisAIBuilder:
+ """Class to build RedisAI from Source"""
+
+ def __init__(
+ self,
+ platform: Platform,
+ mlpackages: MLPackageCollection,
+ build_env: BuildEnv,
+ main_build_path: pathlib.Path,
+ verbose: bool = False,
+ source: t.Union[str, pathlib.Path] = "https://github.com/RedisAI/RedisAI.git",
+ version: str = "v1.2.7",
+ ) -> None:
+
+ self.platform = platform
+ self.mlpackages = mlpackages
+ self.build_env = build_env
+ self.verbose = verbose
+ self.source = source
+ self.version = version
+ self._root_path = main_build_path / "RedisAI"
+
+ self.cleanup_build()
+
+ @property
+ def src_path(self) -> pathlib.Path:
+ return pathlib.Path(self._root_path / "src")
+
+ @property
+ def build_path(self) -> pathlib.Path:
+ return pathlib.Path(self._root_path / "build")
+
+ @property
+ def package_path(self) -> pathlib.Path:
+ return pathlib.Path(self._root_path / "package")
+
+ def cleanup_build(self) -> None:
+ """Removes all directories associated with the build"""
+ shutil.rmtree(self.src_path, ignore_errors=True)
+ shutil.rmtree(self.build_path, ignore_errors=True)
+ shutil.rmtree(self.package_path, ignore_errors=True)
+
+ @property
+ def is_built(self) -> bool:
+ """Determine whether RedisAI and backends were built
+
+ :return: True if all backends and RedisAI module are in
+ the expected location
+ """
+ backend_dir = CONFIG.lib_path / "backends"
+ rai_exists = [
+ (backend_dir / f"redisai_{backend_name}").is_dir()
+ for backend_name in self.mlpackages
+ ]
+ rai_exists.append((CONFIG.lib_path / "redisai.so").is_file())
+ return all(rai_exists)
+
+ @property
+ def build_torch(self) -> bool:
+ """Whether to build torch backend
+
+ :return: True if torch backend should be built
+ """
+ return "libtorch" in self.mlpackages
+
+ @property
+ def build_tensorflow(self) -> bool:
+ """Whether to build tensorflow backend
+
+ :return: True if tensorflow backend should be built
+ """
+ return "libtensorflow" in self.mlpackages
+
+ @property
+ def build_onnxruntime(self) -> bool:
+ """Whether to build onnx backend
+
+ :return: True if onnx backend should be built
+ """
+ return "onnxruntime" in self.mlpackages
+
+ def build(self) -> None:
+ """Build RedisAI
+
+ :param git_url: url from which to retrieve RedisAI
+ :param branch: branch to checkout
+ :param device: cpu or gpu
+ """
+
+ # Following is needed to make sure that the clone/checkout is not
+ # impeded by git LFS limits imposed by RedisAI
+ os.environ["GIT_LFS_SKIP_SMUDGE"] = "1"
+
+ self.src_path.mkdir(parents=True)
+ self.build_path.mkdir(parents=True)
+ self.package_path.mkdir(parents=True)
+
+ retrieve(self.source, self.src_path, depth=1, branch=self.version)
+
+ self._prepare_packages()
+
+ for package in self.mlpackages.values():
+ self._patch_source_files(package.rai_patches)
+ cmake_command = self._rai_cmake_cmd()
+ build_command = self._rai_build_cmd
+
+ if self.platform.device.is_rocm() and "libtorch" in self.mlpackages:
+ pytorch_rocm_arch = os.environ.get("PYTORCH_ROCM_ARCH")
+ if not pytorch_rocm_arch:
+ logger.info(
+ f"PYTORCH_ROCM_ARCH not set. Defaulting to '{_SUPPORTED_ROCM_ARCH}'"
+ )
+ os.environ["PYTORCH_ROCM_ARCH"] = _SUPPORTED_ROCM_ARCH
+ elif pytorch_rocm_arch != _SUPPORTED_ROCM_ARCH:
+ logger.warning(
+ f"PYTORCH_ROCM_ARCH is not {_SUPPORTED_ROCM_ARCH} which is the "
+ "only officially supported architecture. This may still work "
+ "if you are supplying your own version of libtensorflow."
+ )
+
+ logger.info("Configuring CMake Build")
+ if self.verbose:
+ print(" ".join(cmake_command))
+ self.run_command(cmake_command, self.build_path)
+
+ logger.info("Building RedisAI")
+ if self.verbose:
+ print(" ".join(build_command))
+ self.run_command(build_command, self.build_path)
+
+ if self.platform.operating_system == OperatingSystem.LINUX:
+ self._set_execute(CONFIG.lib_path / "redisai.so")
+
+ @staticmethod
+ def _set_execute(target: pathlib.Path) -> None:
+ """Set execute permissions for file
+
+ :param target: The target file to add execute permission
+ """
+ permissions = os.stat(target).st_mode | stat.S_IXUSR
+ os.chmod(target, permissions)
+
+ @staticmethod
+ def _find_closest_object(
+ start_path: pathlib.Path, target_obj: str
+ ) -> t.Optional[pathlib.Path]:
+ queue = deque([start_path])
+ while queue:
+ current_dir = queue.popleft()
+ current_target = current_dir / target_obj
+ if current_target.exists():
+ return current_target.parent
+ for sub_dir in current_dir.iterdir():
+ if sub_dir.is_dir():
+ queue.append(sub_dir)
+ return None
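+
+ # Sketch of the intent (the directory layout is a hypothetical example):
+ # given package/onnxruntime-linux-x64-1.17.3/include/..., calling
+ # _find_closest_object(package_dir, "include") returns the
+ # .../onnxruntime-linux-x64-1.17.3 directory, i.e. the parent of the first
+ # "include" found by the breadth-first search.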
+
+ def _prepare_packages(self) -> None:
+ """Ensure that retrieved archives/packages are in the expected location
+
+ RedisAI requires that the root directory of the backend is at
+ DEP_PATH/example_backend. Due to difficulties in retrieval methods and
+ naming conventions from different sources, this cannot be standardized.
+ Instead we try to find the parent of the "include" directory and assume
+ this is the root.
+ """
+
+ for package in self.mlpackages.values():
+ logger.info(f"Retrieving package: {package.name} {package.version}")
+ target_dir = self.package_path / package.name
+ package.retrieve(target_dir)
+ # Move actual contents to root of the expected location
+ actual_root = self._find_closest_object(target_dir, "include")
+ if actual_root and actual_root != target_dir:
+ logger.debug(
+ f"Non-standard location found: {actual_root} -> {target_dir}"
+ )
+ for file in actual_root.iterdir():
+ file.rename(target_dir / file.name)
+
+ def run_command(self, cmd: t.Union[str, t.List[str]], cwd: pathlib.Path) -> None:
+ """Executor of commands usedi in the build
+
+ :param cmd: The actual command to execute
+ :param cwd: The working directory to execute in
+ """
+ stdout = None if self.verbose else subprocess.DEVNULL
+ stderr = None if self.verbose else subprocess.PIPE
+ proc = subprocess.run(
+ cmd, cwd=str(cwd), stdout=stdout, stderr=stderr, check=False
+ )
+ if proc.returncode != 0:
+ if stderr:
+ print(proc.stderr.decode("utf-8"))
+ raise RedisAIBuildError(
+ f"RedisAI build failed during command: {' '.join(cmd)}"
+ )
+
+ def _rai_cmake_cmd(self) -> t.List[str]:
+ """Build the CMake configuration command
+
+ :return: CMake command with correct options
+ """
+
+ def on_off(expression: bool) -> t.Literal["ON", "OFF"]:
+ return "ON" if expression else "OFF"
+
+ cmake_args = {
+ "BUILD_TF": on_off(self.build_tensorflow),
+ "BUILD_ORT": on_off(self.build_onnxruntime),
+ "BUILD_TORCH": on_off(self.build_torch),
+ "BUILD_TFLITE": "OFF",
+ "DEPS_PATH": str(self.package_path),
+ "DEVICE": "gpu" if self.platform.device.is_gpu() else "cpu",
+ "INSTALL_PATH": str(CONFIG.lib_path),
+ "CMAKE_C_COMPILER": self.build_env.CC,
+ "CMAKE_CXX_COMPILER": self.build_env.CXX,
+ }
+ if self.platform.device.is_rocm():
+ cmake_args["Torch_DIR"] = str(self.package_path / "libtorch")
+ cmd = ["cmake"]
+ cmd += (f"-D{key}={value}" for key, value in cmake_args.items())
+ cmd.append(str(self.src_path))
+ return cmd
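+
+ # For a CPU-only Linux build with all three backends enabled, the resulting
+ # command looks roughly like the following (a sketch; paths and compilers
+ # depend on CONFIG and the detected build environment):
+ #   cmake -DBUILD_TF=ON -DBUILD_ORT=ON -DBUILD_TORCH=ON -DBUILD_TFLITE=OFF \
+ #       -DDEPS_PATH=<root>/RedisAI/package -DDEVICE=cpu \
+ #       -DINSTALL_PATH=<smartsim lib path> -DCMAKE_C_COMPILER=<CC> \
+ #       -DCMAKE_CXX_COMPILER=<CXX> <root>/RedisAI/src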
+
+ @property
+ def _rai_build_cmd(self) -> t.List[str]:
+ """Shell command to build RedisAI and modules
+
+ With the CMake based install, very little needs to be done here.
+ "make install" is used to ensure that all resulting RedisAI backends
+ and their dependencies end up in the same location with the correct
+ RPATH if applicable.
+
+ :return: Command used to compile RedisAI and backends
+ """
+ return "make install -j VERBOSE=1".split(" ")
+
+ def _patch_source_files(self, patches: t.Tuple[RAIPatch, ...]) -> None:
+ """Apply specified RedisAI patches"""
+ for patch in patches:
+ with fileinput.input(
+ str(self.src_path / patch.source_file), inplace=True
+ ) as file_handle:
+ for line in file_handle:
+ line = patch.regex.sub(patch.replacement, line)
+ print(line, end="")
diff --git a/smartsim/_core/_install/types.py b/smartsim/_core/_install/types.py
new file mode 100644
index 0000000000..0266ace341
--- /dev/null
+++ b/smartsim/_core/_install/types.py
@@ -0,0 +1,30 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pathlib
+import typing as t
+
+PathLike = t.Union[str, pathlib.Path]
diff --git a/smartsim/_core/_install/utils/__init__.py b/smartsim/_core/_install/utils/__init__.py
new file mode 100644
index 0000000000..4e47cf282b
--- /dev/null
+++ b/smartsim/_core/_install/utils/__init__.py
@@ -0,0 +1,27 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from .retrieve import retrieve
diff --git a/smartsim/_core/_install/utils/retrieve.py b/smartsim/_core/_install/utils/retrieve.py
new file mode 100644
index 0000000000..fcac565d4b
--- /dev/null
+++ b/smartsim/_core/_install/utils/retrieve.py
@@ -0,0 +1,185 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import pathlib
+import shutil
+import tarfile
+import typing as t
+import zipfile
+from urllib.parse import urlparse
+from urllib.request import urlretrieve
+
+import git
+from tqdm import tqdm
+
+from smartsim._core._install.platform import Architecture, OperatingSystem
+from smartsim._core._install.types import PathLike
+
+
+class UnsupportedArchive(Exception):
+ pass
+
+
+class _TqdmUpTo(tqdm): # type: ignore[type-arg]
+ """Provides `update_to(n)` which uses `tqdm.update(delta_n)`
+
+ From the tqdm documentation for a progress bar when downloading
+ """
+
+ def update_to(
+ self, num_blocks: int = 1, bsize: int = 1, tsize: t.Optional[int] = None
+ ) -> t.Optional[bool]:
+ """Update progress in tqdm-like way
+
+ :param num_blocks: number of blocks transferred so far, defaults to 1
+ :param bsize: size of each block (in tqdm units), defaults to 1
+ :param tsize: total size (in tqdm units), defaults to None
+ :return: Update
+ """
+
+ if tsize is not None:
+ self.total = tsize
+ return self.update(num_blocks * bsize - self.n) # also sets self.n = num_blocks * bsize
+
+
+def _from_local_archive(
+ source: PathLike,
+ destination: pathlib.Path,
+ **kwargs: t.Any,
+) -> None:
+ """Decompress a local archive
+
+ :param source: Path to the archive on a local system
+ :param destination: Where to unpack the archive
+ """
+ if tarfile.is_tarfile(source):
+ with tarfile.open(source) as archive:
+ archive.extractall(path=destination, **kwargs)
+ if zipfile.is_zipfile(source):
+ with zipfile.ZipFile(source) as archive:
+ archive.extractall(path=destination, **kwargs)
+
+
+def _from_local_directory(
+ source: PathLike,
+ destination: pathlib.Path,
+ **kwargs: t.Any,
+) -> None:
+ """Copy the contents of a directory
+
+ :param source: source directory
+ :param destination: destination directory
+ """
+ shutil.copytree(source, destination, **kwargs)
+
+
+def _from_http(
+ source: str,
+ destination: pathlib.Path,
+ **kwargs: t.Any,
+) -> None:
+ """Download and decompress a package
+
+ :param source: URL to a particular package
+ :param destination: Where to unpack the archive
+ """
+ with _TqdmUpTo(
+ unit="B",
+ unit_scale=True,
+ unit_divisor=1024,
+ miniters=1,
+ desc=source.split("/")[-1],
+ ) as _t: # all optional kwargs
+ local_file, _ = urlretrieve(source, reporthook=_t.update_to, **kwargs)
+ _t.total = _t.n
+
+ _from_local_archive(local_file, destination)
+ os.remove(local_file)
+
+
+def _from_git(source: str, destination: pathlib.Path, **clone_kwargs: t.Any) -> None:
+ """Clone a repository
+
+ :param source: Path to the remote (URL or local) repository
+ :param destination: where to clone the repository
+ :param clone_kwargs: various options to send to the clone command
+ """
+ is_mac = OperatingSystem.autodetect() == OperatingSystem.DARWIN
+ is_arm64 = Architecture.autodetect() == Architecture.ARM64
+ if is_mac and is_arm64:
+ config_options = ["--config core.autocrlf=false", "--config core.eol=lf"]
+ allow_unsafe_options = True
+ else:
+ config_options = None
+ allow_unsafe_options = False
+ git.Repo.clone_from(
+ source,
+ destination,
+ multi_options=config_options,
+ allow_unsafe_options=allow_unsafe_options,
+ **clone_kwargs,
+ )
+
+
+def retrieve(
+ source: PathLike, destination: pathlib.Path, **retrieve_kwargs: t.Any
+) -> None:
+ """Primary method for retrieval
+
+ Automatically chooses the correct method based on the extension and/or
+ source of the archive. If downloaded, this will also decompress and
+ extract the archive.
+
+ :param source: URL or path to find the package
+ :param destination: where to place the package
+ :raises UnsupportedArchive: Unknown archive type
+ :raises FileNotFoundError: Path to archive does not exist
+ """
+ parsed_url = urlparse(str(source))
+ url_scheme = parsed_url.scheme
+ if parsed_url.path.endswith(".git"):
+ _from_git(str(source), destination, **retrieve_kwargs)
+ elif url_scheme == "http":
+ _from_http(str(source), destination, **retrieve_kwargs)
+ elif url_scheme == "https":
+ _from_http(str(source), destination, **retrieve_kwargs)
+ else: # This is probably a path
+ source_path = pathlib.Path(source)
+ if not source_path.exists():
+ raise FileNotFoundError(f"Package path or file does not exist: {source}")
+ if source_path.is_dir():
+ _from_local_directory(source, destination, **retrieve_kwargs)
+ elif source_path.is_file() and source_path.suffix in (
+ ".gz",
+ ".zip",
+ ".tgz",
+ ):
+ _from_local_archive(source, destination, **retrieve_kwargs)
+ else:
+ raise UnsupportedArchive(
+ f"Source ({source}) is not a supported archive or directory "
+ )
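+
+# Dispatch sketch; the URLs come from the bundled configs, while the local
+# path and destination are assumptions for illustration:
+#   dest = pathlib.Path("/tmp/smartsim-deps/dlpack")
+#   retrieve("https://github.com/RedisAI/dlpack.git", dest)    # git clone
+#   retrieve("https://github.com/microsoft/onnxruntime/releases/download/"
+#            "v1.17.3/onnxruntime-linux-x64-1.17.3.tgz", dest) # download + extract
+#   retrieve("/path/to/local/backend_dir", dest)                # directory copy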
diff --git a/smartsim/_core/commands/command_list.py b/smartsim/_core/commands/command_list.py
index 9554776e8d..d3d6eace4d 100644
--- a/smartsim/_core/commands/command_list.py
+++ b/smartsim/_core/commands/command_list.py
@@ -83,8 +83,8 @@ def __setitem__(
isinstance(item, str) for item in sublist.command
):
raise TypeError(
- "Value sublists must be a list of Commands when \
-assigning to a slice"
+ "Value sublists must be a list of Commands when assigning \
+to a slice"
)
self._commands[idx] = (deepcopy(val) for val in value)
diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py
index af4bca6a79..478ab02da3 100644
--- a/smartsim/_core/config/config.py
+++ b/smartsim/_core/config/config.py
@@ -32,6 +32,7 @@
import psutil
+
# Configuration Values
#
# These values can be set through environment variables to
@@ -94,11 +95,14 @@ def database_file_parse_interval(self) -> int:
@property
def dragon_dotenv(self) -> Path:
"""Returns the path to a .env file containing dragon environment variables"""
- return self.conf_dir / "dragon" / ".env"
+ return Path(self.conf_dir / "dragon" / ".env")
@property
def dragon_server_path(self) -> t.Optional[str]:
- return os.getenv("SMARTSIM_DRAGON_SERVER_PATH", None)
+ return os.getenv(
+ "SMARTSIM_DRAGON_SERVER_PATH",
+ os.getenv("_SMARTSIM_DRAGON_SERVER_PATH_EXP", None),
+ )
@property
def dragon_server_timeout(self) -> int:
@@ -225,10 +229,6 @@ def smartsim_key_path(self) -> str:
default_path = Path.home() / ".smartsim" / "keys"
return os.environ.get("SMARTSIM_KEY_PATH", str(default_path))
- @property
- def dragon_pin(self) -> str:
- return "0.9"
-
@lru_cache(maxsize=128, typed=False)
def get_config() -> Config:
diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py
index 5cf5aea8b6..bb8ab31ea5 100644
--- a/smartsim/_core/control/job.py
+++ b/smartsim/_core/control/job.py
@@ -76,8 +76,8 @@ def __init__(self) -> None:
@property
def is_fs(self) -> bool:
- """Returns `True` if the entity represents a feature store or
- feature store shard"""
+ """Returns `True` if the entity represents a feature store or feature
+ store shard"""
return self.type in ["featurestore", "fsnode"]
@property
diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py
new file mode 100644
index 0000000000..719c2a60fe
--- /dev/null
+++ b/smartsim/_core/entrypoints/service.py
@@ -0,0 +1,185 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import datetime
+import time
+import typing as t
+from abc import ABC, abstractmethod
+
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class Service(ABC):
+ """Core API for standalone entrypoint scripts. Makes use of overridable hook
+ methods to modify behaviors (event loop, automatic shutdown, cooldown) as
+ well as simple hooks for status changes"""
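+
+ # A minimal, hypothetical subclass (not part of this module) sketching how
+ # the hooks compose:
+ #
+ #   class CounterService(Service):
+ #       def __init__(self) -> None:
+ #           super().__init__(as_service=True, cooldown=5, loop_delay=1)
+ #           self._count = 0
+ #
+ #       def _on_iteration(self) -> None:
+ #           self._count += 1
+ #
+ #       def _can_shutdown(self) -> bool:
+ #           return self._count >= 10
+ #
+ #   CounterService().execute()  # loops until _can_shutdown, then honors cooldown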
+
+ def __init__(
+ self,
+ as_service: bool = False,
+ cooldown: float = 0,
+ loop_delay: float = 0,
+ health_check_frequency: float = 0,
+ ) -> None:
+ """Initialize the Service
+
+ :param as_service: Determines lifetime of the service. When `True`, calling
+ execute on the service will run continuously until shutdown criteria are met.
+ Otherwise, `execute` performs a single pass through the service lifecycle and
+ automatically exits (regardless of the result of `_can_shutdown`).
+ :param cooldown: Period of time (in seconds) to allow the service to run
+ after a shutdown is permitted. Enables the service to avoid restarting if
+ new work is discovered. A value of 0 disables the cooldown.
+ :param loop_delay: Duration (in seconds) of a forced delay between
+ iterations of the event loop
+ :param health_check_frequency: Time (in seconds) between calls to a
+ health check handler. A value of 0 triggers the health check on every
+ iteration.
+ """
+ self._as_service = as_service
+ """Determines lifetime of the service. When `True`, calling
+ `execute` on the service will run continuously until shutdown criteria are met.
+ Otherwise, `execute` performs a single pass through the service lifecycle and
+ automatically exits (regardless of the result of `_can_shutdown`)."""
+ self._cooldown = abs(cooldown)
+ """Period of time (in seconds) to allow the service to run
+ after a shutdown is permitted. Enables the service to avoid restarting if
+ new work is discovered. A value of 0 disables the cooldown."""
+ self._loop_delay = abs(loop_delay)
+ """Duration (in seconds) of a forced delay between
+ iterations of the event loop"""
+ self._health_check_frequency = health_check_frequency
+ """Time (in seconds) between calls to a
+ health check handler. A value of 0 triggers the health check on every
+ iteration."""
+ self._last_health_check = time.time()
+ """The timestamp of the latest health check"""
+
+ @abstractmethod
+ def _on_iteration(self) -> None:
+ """The user-defined event handler. Executed repeatedly until shutdown
+ conditions are satisfied and cooldown is elapsed.
+ """
+
+ @abstractmethod
+ def _can_shutdown(self) -> bool:
+ """Return true when the criteria to shut down the service are met."""
+
+ def _on_start(self) -> None:
+ """Empty hook method for use by subclasses. Called on initial entry into
+ Service `execute` event loop before `_on_iteration` is invoked."""
+ logger.debug(f"Starting {self.__class__.__name__}")
+
+ def _on_shutdown(self) -> None:
+ """Empty hook method for use by subclasses. Called immediately after exiting
+ the main event loop during automatic shutdown."""
+ logger.debug(f"Shutting down {self.__class__.__name__}")
+
+ def _on_health_check(self) -> None:
+ """Empty hook method for use by subclasses. Invoked based on the
+ value of `self._health_check_frequency`."""
+ logger.debug(f"Performing health check for {self.__class__.__name__}")
+
+ def _on_cooldown_elapsed(self) -> None:
+ """Empty hook method for use by subclasses. Called on every event loop
+ iteration immediately upon exceeding the cooldown period"""
+ logger.debug(f"Cooldown exceeded by {self.__class__.__name__}")
+
+ def _on_delay(self) -> None:
+ """Empty hook method for use by subclasses. Called on every event loop
+ iteration immediately before executing a delay before the next iteration"""
+ logger.debug(f"Service iteration waiting for {self.__class__.__name__}s")
+
+ def _log_cooldown(self, elapsed: float) -> None:
+ """Log the remaining cooldown time, if any"""
+ remaining = self._cooldown - elapsed
+ if remaining > 0:
+ logger.debug(f"{abs(remaining):.2f}s remains of {self._cooldown}s cooldown")
+ else:
+ logger.info(f"exceeded cooldown {self._cooldown}s by {abs(remaining):.2f}s")
+
+ def execute(self) -> None:
+ """The main event loop of a service host. Evaluates shutdown criteria and
+ combines with a cooldown period to allow automatic service termination.
+ Responsible for executing calls to subclass implementation of `_on_iteration`"""
+
+ try:
+ self._on_start()
+ except Exception:
+ logger.exception("Unable to start service.")
+ return
+
+ running = True
+ cooldown_start: t.Optional[datetime.datetime] = None
+
+ while running:
+ try:
+ self._on_iteration()
+ except Exception:
+ running = False
+ logger.exception(
+ "Failure in event loop resulted in service termination"
+ )
+
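+ # run the health check hook once the configured interval has elapsed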
+ if self._health_check_frequency >= 0:
+ hc_elapsed = time.time() - self._last_health_check
+ if hc_elapsed >= self._health_check_frequency:
+ self._on_health_check()
+ self._last_health_check = time.time()
+
+ # allow immediate shutdown if not set to run as a service
+ if not self._as_service:
+ running = False
+ continue
+
+ # reset cooldown period if shutdown criteria are not met
+ if not self._can_shutdown():
+ cooldown_start = None
+
+ # start tracking cooldown elapsed once eligible to quit
+ if cooldown_start is None:
+ cooldown_start = datetime.datetime.now()
+
+ # change running state if cooldown period is exceeded
+ if self._cooldown > 0:
+ elapsed = datetime.datetime.now() - cooldown_start
+ running = elapsed.total_seconds() < self._cooldown
+ self._log_cooldown(elapsed.total_seconds())
+ if not running:
+ self._on_cooldown_elapsed()
+ elif self._cooldown < 1 and self._can_shutdown():
+ running = False
+
+ if self._loop_delay:
+ self._on_delay()
+ time.sleep(self._loop_delay)
+
+ try:
+ self._on_shutdown()
+ except Exception:
+ logger.exception("Service shutdown may not have completed.")
diff --git a/smartsim/_core/launcher/dragon/dragon_backend.py b/smartsim/_core/launcher/dragon/dragon_backend.py
index 7d77aaaacc..82863d73b5 100644
--- a/smartsim/_core/launcher/dragon/dragon_backend.py
+++ b/smartsim/_core/launcher/dragon/dragon_backend.py
@@ -26,6 +26,8 @@
import collections
import functools
import itertools
+import os
+import socket
import time
import typing as t
from dataclasses import dataclass, field
@@ -34,15 +36,27 @@
from tabulate import tabulate
-# pylint: disable=import-error
+# pylint: disable=import-error,C0302,R0915
# isort: off
+
import dragon.infrastructure.connection as dragon_connection
import dragon.infrastructure.policy as dragon_policy
-import dragon.native.group_state as dragon_group_state
+import dragon.infrastructure.process_desc as dragon_process_desc
+
import dragon.native.process as dragon_process
import dragon.native.process_group as dragon_process_group
import dragon.native.machine as dragon_machine
+from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter
+from smartsim._core.mli.infrastructure.control.listener import (
+ ConsumerRegistrationListener,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
+from smartsim.error.errors import SmartSimError
+
# pylint: enable=import-error
# isort: on
from ....log import get_logger
@@ -68,8 +82,8 @@
class DragonStatus(str, Enum):
- ERROR = str(dragon_group_state.Error())
- RUNNING = str(dragon_group_state.Running())
+ ERROR = "Error"
+ RUNNING = "Running"
def __str__(self) -> str:
return self.value
@@ -86,7 +100,7 @@ class ProcessGroupInfo:
return_codes: t.Optional[t.List[int]] = None
"""List of return codes of completed processes"""
hosts: t.List[str] = field(default_factory=list)
- """List of hosts on which the Process Group """
+ """List of hosts on which the Process Group should be executed"""
redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None
"""Workers used to redirect stdout and stderr to file"""
@@ -143,6 +157,11 @@ class DragonBackend:
by threads spawned by it.
"""
+ _DEFAULT_NUM_MGR_PER_NODE = 2
+ """The default number of manager processes for each feature store node"""
+ _DEFAULT_MEM_PER_NODE = 512 * 1024**2
+ """The default memory capacity (in bytes) to allocate for a feaure store node"""
+
def __init__(self, pid: int) -> None:
self._pid = pid
"""PID of dragon executable which launched this server"""
@@ -153,7 +172,6 @@ def __init__(self, pid: int) -> None:
self._step_ids = (f"{create_short_id_str()}-{id}" for id in itertools.count())
"""Incremental ID to assign to new steps prior to execution"""
- self._initialize_hosts()
self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = (
collections.OrderedDict()
)
@@ -177,16 +195,26 @@ def __init__(self, pid: int) -> None:
"""Whether the server frontend should shut down when the backend does"""
self._shutdown_initiation_time: t.Optional[float] = None
"""The time at which the server initiated shutdown"""
- smartsim_config = get_config()
- self._cooldown_period = (
- smartsim_config.telemetry_frequency * 2 + 5
- if smartsim_config.telemetry_enabled
- else 5
- )
- """Time in seconds needed to server to complete shutdown"""
+ self._cooldown_period = self._initialize_cooldown()
+ """Time in seconds needed by the server to complete shutdown"""
+ self._backbone: t.Optional[BackboneFeatureStore] = None
+ """The backbone feature store"""
+ self._listener: t.Optional[dragon_process.Process] = None
+ """The standalone process executing the event consumer"""
+
+ self._nodes: t.List["dragon_machine.Node"] = []
+ """Node capability information for hosts in the allocation"""
+ self._hosts: t.List[str] = []
+ """List of hosts available in allocation"""
+ self._cpus: t.List[int] = []
+ """List of cpu-count by node"""
+ self._gpus: t.List[int] = []
+ """List of gpu-count by node"""
+ self._allocated_hosts: t.Dict[str, t.Set[str]] = {}
+ """Mapping with hostnames as keys and a set of running step IDs as the value"""
- self._view = DragonBackendView(self)
- logger.debug(self._view.host_desc)
+ self._initialize_hosts()
+ self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock)
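+ """Prioritizer used to select the least-referenced nodes for new steps"""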
@property
def hosts(self) -> list[str]:
@@ -194,34 +222,39 @@ def hosts(self) -> list[str]:
return self._hosts
@property
- def allocated_hosts(self) -> dict[str, str]:
+ def allocated_hosts(self) -> dict[str, t.Set[str]]:
+ """A map of host names to the step id executing on a host
+
+ :returns: Dictionary with host name as key and step id as value"""
with self._queue_lock:
return self._allocated_hosts
@property
- def free_hosts(self) -> t.Deque[str]:
+ def free_hosts(self) -> t.Sequence[str]:
+ """Find hosts that do not have a step assigned
+
+ :returns: List of host names"""
with self._queue_lock:
- return self._free_hosts
+ return list(map(lambda x: x.hostname, self._prioritizer.unassigned()))
@property
def group_infos(self) -> dict[str, ProcessGroupInfo]:
+ """Find information pertaining to process groups executing on a host
+
+ :returns: Dictionary with host name as key and group information as value"""
with self._queue_lock:
return self._group_infos
def _initialize_hosts(self) -> None:
+ """Prepare metadata about the allocation"""
with self._queue_lock:
self._nodes = [
dragon_machine.Node(node) for node in dragon_machine.System().nodes
]
- self._hosts: t.List[str] = sorted(node.hostname for node in self._nodes)
+ self._hosts = sorted(node.hostname for node in self._nodes)
self._cpus = [node.num_cpus for node in self._nodes]
self._gpus = [node.num_gpus for node in self._nodes]
-
- """List of hosts available in allocation"""
- self._free_hosts: t.Deque[str] = collections.deque(self._hosts)
- """List of hosts on which steps can be launched"""
- self._allocated_hosts: t.Dict[str, str] = {}
- """Mapping of hosts on which a step is already running to step ID"""
+ self._allocated_hosts = collections.defaultdict(set)
def __str__(self) -> str:
return self.status_message
@@ -230,21 +263,19 @@ def __str__(self) -> str:
def status_message(self) -> str:
"""Message with status of available nodes and history of launched jobs.
- :returns: Status message
+ :returns: a status message
"""
- return (
- "Dragon server backend update\n"
- f"{self._view.host_table}\n{self._view.step_table}"
- )
+ view = DragonBackendView(self)
+ return "Dragon server backend update\n" f"{view.host_table}\n{view.step_table}"
def _heartbeat(self) -> None:
+ """Update the value of the last heartbeat to the current time."""
self._last_beat = self.current_time
@property
def cooldown_period(self) -> int:
- """Time (in seconds) the server will wait before shutting down
-
- when exit conditions are met (see ``should_shutdown()`` for further details).
+ """Time (in seconds) the server will wait before shutting down when
+ exit conditions are met (see ``should_shutdown()`` for further details).
"""
return self._cooldown_period
@@ -278,6 +309,8 @@ def should_shutdown(self) -> bool:
and it requested immediate shutdown, or if it did not request immediate
shutdown, but all jobs have been executed.
In both cases, a cooldown period may need to be waited before shutdown.
+
+ :returns: `True` if the server should terminate, otherwise `False`
"""
if self._shutdown_requested and self._can_shutdown:
return self._has_cooled_down
@@ -285,7 +318,9 @@ def should_shutdown(self) -> bool:
@property
def current_time(self) -> float:
- """Current time for DragonBackend object, in seconds since the Epoch"""
+ """Current time for DragonBackend object, in seconds since the Epoch
+
+ :returns: the current timestamp"""
return time.time()
def _can_honor_policy(
@@ -293,63 +328,149 @@ def _can_honor_policy(
) -> t.Tuple[bool, t.Optional[str]]:
"""Check if the policy can be honored with resources available
in the allocation.
- :param request: DragonRunRequest containing policy information
+
+ :param request: `DragonRunRequest` to validate
:returns: Tuple indicating if the policy can be honored and
an optional error message"""
# ensure the policy can be honored
if request.policy:
+ logger.debug(f"{request.policy=}{self._cpus=}{self._gpus=}")
+
if request.policy.cpu_affinity:
# make sure some node has enough CPUs
- available = max(self._cpus)
+ last_available = max(self._cpus or [-1])
requested = max(request.policy.cpu_affinity)
-
- if requested >= available:
+ if not any(self._cpus) or requested >= last_available:
return False, "Cannot satisfy request, not enough CPUs available"
-
if request.policy.gpu_affinity:
# make sure some node has enough GPUs
- available = max(self._gpus)
+ last_available = max(self._gpus or [-1])
requested = max(request.policy.gpu_affinity)
-
- if requested >= available:
+ if not any(self._gpus) or requested >= last_available:
+ logger.warning(
+ f"failed check w/{self._gpus=}, {requested=}, {last_available=}"
+ )
return False, "Cannot satisfy request, not enough GPUs available"
-
return True, None
def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]:
- """Check if request can be honored with resources available in the allocation.
-
- Currently only checks for total number of nodes,
- in the future it will also look at other constraints
- such as memory, accelerators, and so on.
+ """Check if request can be honored with resources available in
+ the allocation. Currently only checks for total number of nodes,
+ in the future it will also look at other constraints such as memory,
+ accelerators, and so on.
+
+ :param request: `DragonRunRequest` to validate
+ :returns: Tuple indicating if the request can be honored and
+ an optional error message
"""
- if request.nodes > len(self._hosts):
- message = f"Cannot satisfy request. Requested {request.nodes} nodes, "
- message += f"but only {len(self._hosts)} nodes are available."
- return False, message
- if self._shutdown_requested:
- message = "Cannot satisfy request, server is shutting down."
- return False, message
+ honorable, err = self._can_honor_state(request)
+ if not honorable:
+ return False, err
honorable, err = self._can_honor_policy(request)
if not honorable:
return False, err
+ honorable, err = self._can_honor_hosts(request)
+ if not honorable:
+ return False, err
+
+ return True, None
+
+ def _can_honor_hosts(
+ self, request: DragonRunRequest
+ ) -> t.Tuple[bool, t.Optional[str]]:
+ """Check if the current state of the backend process inhibits executing
+ the request.
+
+ :param request: `DragonRunRequest` to validate
+ :returns: Tuple indicating if the request can be honored and
+ an optional error message"""
+ all_hosts = frozenset(self._hosts)
+ num_nodes = request.nodes
+
+ # fail if requesting more nodes than the total number available
+ if num_nodes > len(all_hosts):
+ message = f"Cannot satisfy request. {num_nodes} requested nodes"
+ message += f" exceeds {len(all_hosts)} available."
+ return False, message
+
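+ # default to all hosts in the allocation unless specific hosts were requested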
+ requested_hosts = all_hosts
+ if request.hostlist:
+ requested_hosts = frozenset(
+ {host.strip() for host in request.hostlist.split(",")}
+ )
+
+ valid_hosts = all_hosts.intersection(requested_hosts)
+ invalid_hosts = requested_hosts - valid_hosts
+
+ logger.debug(f"{num_nodes=}{valid_hosts=}{invalid_hosts=}")
+
+ if invalid_hosts:
+ logger.warning(f"Some invalid hostnames were requested: {invalid_hosts}")
+
+ # fail if requesting specific hostnames and there aren't enough available
+ if num_nodes > len(valid_hosts):
+ message = f"Cannot satisfy request. Requested {num_nodes} nodes, "
+ message += f"but only {len(valid_hosts)} named hosts are available."
+ return False, message
+
+ return True, None
+
+ def _can_honor_state(
+ self, _request: DragonRunRequest
+ ) -> t.Tuple[bool, t.Optional[str]]:
+ """Check if the current state of the backend process inhibits executing
+ the request.
+
+ :param _request: the `DragonRunRequest` to verify
+ :returns: Tuple indicating if the request can be honored and
+ an optional error message"""
+ if self._shutdown_requested:
+ message = "Cannot satisfy request, server is shutting down."
+ return False, message
+
return True, None
def _allocate_step(
self, step_id: str, request: DragonRunRequest
) -> t.Optional[t.List[str]]:
+ """Identify the hosts on which the request will be executed
+ :param step_id: The identifier of a step that will be executed on the host
+ :param request: The request to be executed
+ :returns: A list of selected hostnames"""
+ # ensure at least one host is selected
num_hosts: int = request.nodes
+
with self._queue_lock:
- if num_hosts <= 0 or num_hosts > len(self._free_hosts):
+ if num_hosts <= 0 or num_hosts > len(self._hosts):
+ logger.debug(
+ f"The number of requested hosts ({num_hosts}) is invalid or"
+ f" cannot be satisfied with {len(self._hosts)} available nodes"
+ )
return None
- to_allocate = []
- for _ in range(num_hosts):
- host = self._free_hosts.popleft()
- self._allocated_hosts[host] = step_id
- to_allocate.append(host)
+
+ hosts = []
+ if request.hostlist:
+ # convert the comma-separated argument into a real list
+ hosts = [host for host in request.hostlist.split(",") if host]
+
+ filter_on: t.Optional[PrioritizerFilter] = None
+ if request.policy and request.policy.gpu_affinity:
+ filter_on = PrioritizerFilter.GPU
+
+ nodes = self._prioritizer.next_n(num_hosts, filter_on, step_id, hosts)
+
+ if len(nodes) < num_hosts:
+ # exit if the prioritizer can't identify enough nodes
+ return None
+
+ to_allocate = [node.hostname for node in nodes]
+
+ for hostname in to_allocate:
+ # track assigning this step to each node
+ self._allocated_hosts[hostname].add(step_id)
+
return to_allocate
@staticmethod
@@ -389,6 +510,7 @@ def _create_redirect_workers(
return grp_redir
def _stop_steps(self) -> None:
+ """Trigger termination of all currently executing steps"""
self._heartbeat()
with self._queue_lock:
while len(self._stop_requests) > 0:
@@ -427,18 +549,96 @@ def _stop_steps(self) -> None:
self._group_infos[step_id].status = JobStatus.CANCELLED
self._group_infos[step_id].return_codes = [-9]
+ def _create_backbone(self) -> BackboneFeatureStore:
+ """
+ Creates a BackboneFeatureStore if one does not exist. Updates
+ environment variables of this process to include the backbone
+ descriptor.
+
+ :returns: The backbone feature store
+ """
+ if self._backbone is None:
+ backbone_storage = create_ddict(
+ len(self._hosts),
+ self._DEFAULT_NUM_MGR_PER_NODE,
+ self._DEFAULT_MEM_PER_NODE,
+ )
+
+ self._backbone = BackboneFeatureStore(
+ backbone_storage, allow_reserved_writes=True
+ )
+
+ # put the backbone descriptor in the env vars
+ os.environ.update(self._backbone.get_env())
+
+ return self._backbone
+
+ @staticmethod
+ def _initialize_cooldown() -> int:
+ """Load environment configuration and determine the correct cooldown
+ period to apply to the backend process.
+
+ :returns: The calculated cooldown (in seconds)
+ """
+ smartsim_config = get_config()
+ return (
+ smartsim_config.telemetry_frequency * 2 + 5
+ if smartsim_config.telemetry_enabled
+ else 5
+ )
+
+ def start_event_listener(
+ self, cpu_affinity: list[int], gpu_affinity: list[int]
+ ) -> dragon_process.Process:
+ """Start a standalone event listener.
+
+ :param cpu_affinity: The CPU affinity for the process
+ :param gpu_affinity: The GPU affinity for the process
+ :returns: The dragon Process managing the listener process
+ :raises SmartSimError: If the backbone is not provided
+ """
+ if self._backbone is None:
+ raise SmartSimError("Backbone feature store is not available")
+
+ service = ConsumerRegistrationListener(
+ self._backbone, 1.0, 2.0, as_service=True, health_check_frequency=90
+ )
+
+ options = dragon_process_desc.ProcessOptions(make_inf_channels=True)
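+ # pin the listener process to the host running this backend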
+ local_policy = dragon_policy.Policy(
+ placement=dragon_policy.Policy.Placement.HOST_NAME,
+ host_name=socket.gethostname(),
+ cpu_affinity=cpu_affinity,
+ gpu_affinity=gpu_affinity,
+ )
+ process = dragon_process.Process(
+ target=service.execute,
+ args=[],
+ cwd=os.getcwd(),
+ env={
+ **os.environ,
+ **self._backbone.get_env(),
+ },
+ policy=local_policy,
+ options=options,
+ stderr=dragon_process.Popen.STDOUT,
+ stdout=dragon_process.Popen.STDOUT,
+ )
+ process.start()
+ return process
+
@staticmethod
def create_run_policy(
request: DragonRequest, node_name: str
) -> "dragon_policy.Policy":
"""Create a dragon Policy from the request and node name
+
:param request: DragonRunRequest containing policy information
:param node_name: Name of the node on which the process will run
:returns: dragon_policy.Policy object mapped from request properties"""
if isinstance(request, DragonRunRequest):
run_request: DragonRunRequest = request
- affinity = dragon_policy.Policy.Affinity.DEFAULT
cpu_affinity: t.List[int] = []
gpu_affinity: t.List[int] = []
@@ -446,25 +646,20 @@ def create_run_policy(
if run_request.policy is not None:
# Affinities are not mutually exclusive. If specified, both are used
if run_request.policy.cpu_affinity:
- affinity = dragon_policy.Policy.Affinity.SPECIFIC
cpu_affinity = run_request.policy.cpu_affinity
if run_request.policy.gpu_affinity:
- affinity = dragon_policy.Policy.Affinity.SPECIFIC
gpu_affinity = run_request.policy.gpu_affinity
logger.debug(
- f"Affinity strategy: {affinity}, "
f"CPU affinity mask: {cpu_affinity}, "
f"GPU affinity mask: {gpu_affinity}"
)
- if affinity != dragon_policy.Policy.Affinity.DEFAULT:
- return dragon_policy.Policy(
- placement=dragon_policy.Policy.Placement.HOST_NAME,
- host_name=node_name,
- affinity=affinity,
- cpu_affinity=cpu_affinity,
- gpu_affinity=gpu_affinity,
- )
+ return dragon_policy.Policy(
+ placement=dragon_policy.Policy.Placement.HOST_NAME,
+ host_name=node_name,
+ cpu_affinity=cpu_affinity,
+ gpu_affinity=gpu_affinity,
+ )
return dragon_policy.Policy(
placement=dragon_policy.Policy.Placement.HOST_NAME,
@@ -472,7 +667,9 @@ def create_run_policy(
)
def _start_steps(self) -> None:
+ """Start all new steps created since the last update."""
self._heartbeat()
+
with self._queue_lock:
started = []
for step_id, request in self._queued_steps.items():
@@ -482,10 +679,8 @@ def _start_steps(self) -> None:
logger.debug(f"Step id {step_id} allocated on {hosts}")
- global_policy = dragon_policy.Policy(
- placement=dragon_policy.Policy.Placement.HOST_NAME,
- host_name=hosts[0],
- )
+ global_policy = self.create_run_policy(request, hosts[0])
+ options = dragon_process_desc.ProcessOptions(make_inf_channels=True)
grp = dragon_process_group.ProcessGroup(
restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy
)
@@ -498,10 +693,15 @@ def _start_steps(self) -> None:
target=request.exe,
args=request.exe_args,
cwd=request.path,
- env={**request.current_env, **request.env},
+ env={
+ **request.current_env,
+ **request.env,
+ **(self._backbone.get_env() if self._backbone else {}),
+ },
stdout=dragon_process.Popen.PIPE,
stderr=dragon_process.Popen.PIPE,
policy=local_policy,
+ options=options,
)
grp.add_process(nproc=request.tasks_per_node, template=tmp_proc)
@@ -567,9 +767,11 @@ def _start_steps(self) -> None:
logger.error(e)
def _refresh_statuses(self) -> None:
+ """Query underlying management system for step status and update
+ stored assigned and unassigned task information"""
self._heartbeat()
with self._queue_lock:
- terminated = []
+ terminated: t.Set[str] = set()
for step_id in self._running_steps:
group_info = self._group_infos[step_id]
grp = group_info.process_group
@@ -603,11 +805,15 @@ def _refresh_statuses(self) -> None:
)
if group_info.status in TERMINAL_STATUSES:
- terminated.append(step_id)
+ terminated.add(step_id)
if terminated:
logger.debug(f"{terminated=}")
+ # remove all the terminated steps from all hosts
+ for host in list(self._allocated_hosts.keys()):
+ self._allocated_hosts[host].difference_update(terminated)
+
for step_id in terminated:
self._running_steps.remove(step_id)
self._completed_steps.append(step_id)
@@ -615,15 +821,20 @@ def _refresh_statuses(self) -> None:
if group_info is not None:
for host in group_info.hosts:
logger.debug(f"Releasing host {host}")
- try:
- self._allocated_hosts.pop(host)
- except KeyError:
+ if host not in self._allocated_hosts:
logger.error(f"Tried to free a non-allocated host: {host}")
- self._free_hosts.append(host)
+ else:
+ # remove any hosts that have had all their steps terminated
+ if not self._allocated_hosts[host]:
+ self._allocated_hosts.pop(host)
+ self._prioritizer.decrement(host, step_id)
group_info.process_group = None
group_info.redir_workers = None
def _update_shutdown_status(self) -> None:
+ """Query the status of running tasks and update the status
+ of any that have completed.
+ """
self._heartbeat()
with self._queue_lock:
self._can_shutdown |= (
@@ -637,12 +848,18 @@ def _update_shutdown_status(self) -> None:
)
def _should_print_status(self) -> bool:
+ """Determine if status messages should be printed based off the last
+ update. Returns `True` to trigger prints, `False` otherwise.
+ """
if self.current_time - self._last_update_time > 10:
self._last_update_time = self.current_time
return True
return False
def _update(self) -> None:
+ """Trigger all update queries and update local state database"""
+ self._create_backbone()
+
self._stop_steps()
self._start_steps()
self._refresh_statuses()
@@ -650,6 +867,9 @@ def _update(self) -> None:
def _kill_all_running_jobs(self) -> None:
with self._queue_lock:
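+ # terminate the standalone event listener if it is still running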
+ if self._listener and self._listener.is_alive:
+ self._listener.kill()
+
for step_id, group_info in self._group_infos.items():
if group_info.status not in TERMINAL_STATUSES:
self._stop_requests.append(DragonStopRequest(step_id=step_id))
@@ -728,8 +948,14 @@ def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse:
class DragonBackendView:
- def __init__(self, backend: DragonBackend):
+ def __init__(self, backend: DragonBackend) -> None:
+ """Initialize the instance
+
+ :param backend: A dragon backend used to produce the view"""
self._backend = backend
+ """A dragon backend used to produce the view"""
+
+ logger.debug(self.host_desc)
@property
def host_desc(self) -> str:
@@ -791,9 +1017,7 @@ def step_table(self) -> str:
@property
def host_table(self) -> str:
"""Table representation of current state of nodes available
-
- in the allocation.
- """
+ in the allocation."""
headers = ["Host", "Status"]
hosts = self._backend.hosts
free_hosts = self._backend.free_hosts
diff --git a/smartsim/_core/launcher/dragon/dragon_connector.py b/smartsim/_core/launcher/dragon/dragon_connector.py
index 7ff4cdc1c8..9c96592776 100644
--- a/smartsim/_core/launcher/dragon/dragon_connector.py
+++ b/smartsim/_core/launcher/dragon/dragon_connector.py
@@ -76,9 +76,11 @@ class DragonConnector:
def __init__(self, path: str | os.PathLike[str]) -> None:
self._context: zmq.Context[t.Any] = zmq.Context.instance()
+ """ZeroMQ context used to share configuration across requests"""
self._context.setsockopt(zmq.REQ_CORRELATE, 1)
self._context.setsockopt(zmq.REQ_RELAXED, 1)
self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None
+ """ZeroMQ authenticator used to secure queue access"""
config = get_config()
self._reset_timeout(config.dragon_server_timeout)
@@ -88,17 +90,21 @@ def __init__(self, path: str | os.PathLike[str]) -> None:
# fine as we expect the that method should only be called once
# without hitting a guard clause.
self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None
+ """ZeroMQ socket exposing the connection to the DragonBackend"""
self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None
+ """A handle to the process executing the DragonBackend"""
# Returned by dragon head, useful if shutdown is to be requested
# but process was started by another connector
self._dragon_head_pid: t.Optional[int] = None
+ """Process ID of the process executing the DragonBackend"""
self._dragon_server_path = _resolve_dragon_path(path)
+ """Path to a dragon installation"""
logger.debug(f"Dragon Server path was set to {self._dragon_server_path}")
self._env_vars: t.Dict[str, str] = {}
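+ """Environment variables loaded from the persisted dragon .env file"""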
@property
def is_connected(self) -> bool:
- """Whether the Connector established a connection to the server
+ """Whether the Connector established a connection to the server.
:return: True if connected
"""
@@ -107,12 +113,18 @@ def is_connected(self) -> bool:
@property
def can_monitor(self) -> bool:
"""Whether the Connector knows the PID of the dragon server head process
- and can monitor its status
+ and can monitor its status.
:return: True if the server can be monitored"""
return self._dragon_head_pid is not None
def _handshake(self, address: str) -> None:
+ """Perform the handshake process with the DragonBackend and
+ confirm two-way communication is established.
+
+ :param address: The address of the head node socket to initiate a
+ handshake with
+ """
self._dragon_head_socket = dragon_sockets.get_secure_socket(
self._context, zmq.REQ, False
)
@@ -135,6 +147,11 @@ def _handshake(self, address: str) -> None:
) from e
def _reset_timeout(self, timeout: int = get_config().dragon_server_timeout) -> None:
+ """Reset the timeout applied to the ZMQ context. If an authenticator is
+ enabled, also update the authenticator timeouts.
+
+ :param timeout: The timeout value to apply to ZMQ sockets
+ """
self._context.setsockopt(zmq.SNDTIMEO, value=timeout)
self._context.setsockopt(zmq.RCVTIMEO, value=timeout)
if self._authenticator is not None and self._authenticator.thread is not None:
@@ -186,11 +203,19 @@ def _get_new_authenticator(
@staticmethod
def _get_dragon_log_level() -> str:
+ """Maps the log level from SmartSim to a valid log level
+ for a dragon process.
+
+ :returns: The dragon log level string
+ """
smartsim_to_dragon = defaultdict(lambda: "NONE")
smartsim_to_dragon["developer"] = "INFO"
return smartsim_to_dragon.get(get_config().log_level, "NONE")
def _connect_to_existing_server(self, path: Path) -> None:
+ """Connects to an existing DragonBackend using address information from
+ a persisted dragon log file.
+
+ :param path: Path to the directory containing the dragon configuration log
+ """
config = get_config()
dragon_config_log = path / config.dragon_log_filename
@@ -220,6 +245,11 @@ def _connect_to_existing_server(self, path: Path) -> None:
return
def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]:
+ """Instantiate the ZMQ socket to be used by the connector.
+
+ :param socket_addr: The socket address the connector should bind to
+ :returns: The bound socket
+ """
config = get_config()
connector_socket: t.Optional[zmq.Socket[t.Any]] = None
self._reset_timeout(config.dragon_server_startup_timeout)
@@ -250,9 +280,14 @@ def load_persisted_env(self) -> t.Dict[str, str]:
with open(config.dragon_dotenv, encoding="utf-8") as dot_env:
for kvp in dot_env.readlines():
- split = kvp.strip().split("=", maxsplit=1)
- key, value = split[0], split[-1]
- self._env_vars[key] = value
+ if not kvp.strip():
+ continue
+
+ # skip any commented lines
+ if not kvp.startswith("#"):
+ split = kvp.strip().split("=", maxsplit=1)
+ key, value = split[0], split[-1]
+ self._env_vars[key] = value
return self._env_vars
@@ -422,6 +457,15 @@ def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse
def _parse_launched_dragon_server_info_from_iterable(
stream: t.Iterable[str], num_dragon_envs: t.Optional[int] = None
) -> t.List[t.Dict[str, str]]:
+ """Parses dragon backend connection information from a stream.
+
+ :param stream: The stream to inspect. Usually the stdout of the
+ DragonBackend process
+ :param num_dragon_envs: The expected number of dragon environments
+ to parse from the stream.
+ :returns: A list of dictionaries, one per environment, containing
+ the parsed server information
+ """
lines = (line.strip() for line in stream)
lines = (line for line in lines if line)
tokenized = (line.split(maxsplit=1) for line in lines)
@@ -448,6 +492,15 @@ def _parse_launched_dragon_server_info_from_files(
file_paths: t.List[t.Union[str, "os.PathLike[str]"]],
num_dragon_envs: t.Optional[int] = None,
) -> t.List[t.Dict[str, str]]:
+ """Read a known log file into a Stream and parse dragon server configuration
+ from the stream.
+
+ :param file_paths: Path to a file containing dragon server configuration
+ :param num_dragon_envs: The expected number of dragon environments to be found
+ in the file
+ :returns: The parsed server configuration, one item per
+ discovered dragon environment
+ """
with fileinput.FileInput(file_paths) as ifstream:
dragon_envs = cls._parse_launched_dragon_server_info_from_iterable(
ifstream, num_dragon_envs
@@ -462,6 +515,15 @@ def _send_req_with_socket(
send_flags: int = 0,
recv_flags: int = 0,
) -> DragonResponse:
+ """Sends a synchronous request through a ZMQ socket.
+
+ :param socket: Socket to send on
+ :param request: The request to send
+ :param send_flags: Configuration to apply to the send operation
+ :param recv_flags: Configuration to apply to the recv operation; used to
+ allow the receiver to immediately respond to the sent request.
+ :returns: The response from the target
+ """
client = dragon_sockets.as_client(socket)
with DRG_LOCK:
logger.debug(f"Sending {type(request).__name__}: {request}")
@@ -473,6 +535,13 @@ def _send_req_with_socket(
def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT:
+ """Verify that objects can be sent as messages acceptable to the target.
+
+ :param obj: The message to test
+ :param typ: The type that is acceptable
+ :returns: The original `obj` if it is of the requested type
+ :raises TypeError: If the object fails the test and is not
+ an instance of the desired type"""
if not isinstance(obj, typ):
raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}")
return obj
diff --git a/smartsim/_core/launcher/dragon/dragon_launcher.py b/smartsim/_core/launcher/dragon/dragon_launcher.py
index 5e36b8a3fd..4af93b68ed 100644
--- a/smartsim/_core/launcher/dragon/dragon_launcher.py
+++ b/smartsim/_core/launcher/dragon/dragon_launcher.py
@@ -72,7 +72,7 @@
# ***************************************
# TODO: Remove pylint disable after merge
# ***************************************
-# pylint: disable=protected-access
+# pylint: disable=protected-access,wrong-import-position
class DragonLauncher(WLMLauncher):
@@ -206,6 +206,8 @@ def run(self, step: Step) -> t.Optional[str]:
self._connector.load_persisted_env()
nodes = int(run_args.get("nodes", None) or 1)
tasks_per_node = int(run_args.get("tasks-per-node", None) or 1)
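+ # optional comma-separated list of hostnames to restrict step placement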
+ hosts = run_args.get("host-list", None)
+
policy = DragonRunPolicy.from_run_args(run_args)
step_id = self.start(
(
@@ -219,6 +221,7 @@ def run(self, step: Step) -> t.Optional[str]:
env=req_env,
output_file=out,
error_file=err,
+ hostlist=hosts,
),
policy,
)
@@ -374,15 +377,15 @@ def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT:
return obj
-from smartsim._core.dispatch import dispatch # pylint: disable=wrong-import-position
+from smartsim._core.dispatch import dispatch
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# TODO: Remove this registry and move back to builder file after fixing
# circular import caused by `DragonLauncher.supported_rs`
# -----------------------------------------------------------------------------
-from smartsim.settings.arguments.launch.dragon import ( # pylint: disable=wrong-import-position
+from smartsim.settings.arguments.launch.dragon import (
DragonLaunchArguments,
-)
+) # pylint: disable=wrong-import-position
def _as_run_request_args_and_policy(
@@ -404,11 +407,6 @@ def _as_run_request_args_and_policy(
DragonRunRequestView(
exe=exe_,
exe_args=args,
- # FIXME: Currently this is hard coded because the schema requires
- # it, but in future, it is almost certainly necessary that
- # this will need to be injected by the user or by us to have
- # the command execute next to any generated files. A similar
- # problem exists for the other settings.
path=path,
env=env,
# TODO: Not sure how this info is injected
diff --git a/smartsim/_core/launcher/dragon/pqueue.py b/smartsim/_core/launcher/dragon/pqueue.py
new file mode 100644
index 0000000000..8c14a828f5
--- /dev/null
+++ b/smartsim/_core/launcher/dragon/pqueue.py
@@ -0,0 +1,461 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import enum
+import heapq
+import threading
+import typing as t
+
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class Node(t.Protocol):
+ """Base Node API required to support the NodePrioritizer"""
+
+ @property
+ def hostname(self) -> str:
+ """The hostname of the node"""
+
+ @property
+ def num_cpus(self) -> int:
+ """The number of CPUs in the node"""
+
+ @property
+ def num_gpus(self) -> int:
+ """The number of GPUs in the node"""
+
+
+class NodeReferenceCount(t.Protocol):
+ """Contains details pertaining to references to a node"""
+
+ @property
+ def hostname(self) -> str:
+ """The hostname of the node"""
+
+ @property
+ def num_refs(self) -> int:
+ """The number of jobs assigned to the node"""
+
+
+class _TrackedNode:
+ """Node API required to have support in the NodePrioritizer"""
+
+ def __init__(self, node: Node) -> None:
+ self._node = node
+ """The node being tracked"""
+ self._num_refs = 0
+ """The number of references to the tracked node"""
+ self._assigned_tasks: t.Set[str] = set()
+ """The unique identifiers of processes using this node"""
+ self._is_dirty = False
+ """Flag indicating that tracking information has been modified"""
+
+ @property
+ def hostname(self) -> str:
+ """Returns the hostname of the node"""
+ return self._node.hostname
+
+ @property
+ def num_cpus(self) -> int:
+ """Returns the number of CPUs in the node"""
+ return self._node.num_cpus
+
+ @property
+ def num_gpus(self) -> int:
+ """Returns the number of GPUs attached to the node"""
+ return self._node.num_gpus
+
+ @property
+ def num_refs(self) -> int:
+ """Returns the number of processes currently running on the node"""
+ return self._num_refs
+
+ @property
+ def is_assigned(self) -> bool:
+ """Returns `True` if no references are currently counted, `False` otherwise"""
+ return self._num_refs > 0
+
+ @property
+ def assigned_tasks(self) -> t.Set[str]:
+ """Returns the set of unique IDs for currently running processes"""
+ return self._assigned_tasks
+
+ @property
+ def is_dirty(self) -> bool:
+ """Returns a flag indicating if the reference counter has changed. `True`
+ if references have been added or removed, `False` otherwise."""
+ return self._is_dirty
+
+ def clean(self) -> None:
+ """Marks the node as unmodified"""
+ self._is_dirty = False
+
+ def add(
+ self,
+ tracking_id: t.Optional[str] = None,
+ ) -> None:
+ """Update the node to indicate the addition of a process that must be
+ reference counted.
+
+ :param tracking_id: a unique task identifier executing on the node
+ to add
+ :raises ValueError: if tracking_id is already assigned to this node"""
+ if tracking_id in self.assigned_tasks:
+ raise ValueError("Attempted adding task more than once")
+
+ self._num_refs = self._num_refs + 1
+ if tracking_id:
+ self._assigned_tasks = self._assigned_tasks.union({tracking_id})
+ self._is_dirty = True
+
+ def remove(
+ self,
+ tracking_id: t.Optional[str] = None,
+ ) -> None:
+ """Update the reference counter to indicate the removal of a process.
+
+ :param tracking_id: a unique task identifier executing on the node
+ to remove"""
+ self._num_refs = max(self._num_refs - 1, 0)
+ if tracking_id:
+ self._assigned_tasks = self._assigned_tasks - {tracking_id}
+ self._is_dirty = True
+
+ def __lt__(self, other: "_TrackedNode") -> bool:
+ """Comparison operator used to evaluate the ordering of nodes within
+ the prioritizer. This comparison only considers reference counts.
+
+ :param other: Another node to compare against
+ :returns: True if this node has fewer references than the other node"""
+ if self.num_refs < other.num_refs:
+ return True
+
+ return False
+
+
+class PrioritizerFilter(str, enum.Enum):
+ """A filter used to select a subset of nodes to be queried"""
+
+ CPU = enum.auto()
+ GPU = enum.auto()
+
+
+class NodePrioritizer:
+ def __init__(self, nodes: t.List[Node], lock: threading.RLock) -> None:
+ """Initialize the prioritizer
+
+ :param nodes: node attribute information for initializing the prioritizer
+ :param lock: a lock used to ensure threadsafe operations
+ :raises SmartSimError: if the nodes collection is empty
+ """
+ if not nodes:
+ raise SmartSimError("Missing nodes to prioritize")
+
+ self._lock = lock
+ """Lock used to ensure thread safe changes of the reference counters"""
+ self._cpu_refs: t.List[_TrackedNode] = []
+ """Track reference counts to CPU-only nodes"""
+ self._gpu_refs: t.List[_TrackedNode] = []
+ """Track reference counts to GPU nodes"""
+ self._nodes: t.Dict[str, _TrackedNode] = {}
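+ """Mapping of hostname to tracked node, used for O(1) lookups by host"""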
+
+ self._initialize_reference_counters(nodes)
+
+ def _initialize_reference_counters(self, nodes: t.List[Node]) -> None:
+ """Perform initialization of reference counters for nodes in the allocation
+
+ :param nodes: node attribute information for initializing the prioritizer"""
+ for node in nodes:
+ # create a set of reference counters for the nodes
+ tracked = _TrackedNode(node)
+
+ self._nodes[node.hostname] = tracked # for O(1) access
+
+ if node.num_gpus:
+ self._gpu_refs.append(tracked)
+ else:
+ self._cpu_refs.append(tracked)
+
+ def increment(
+ self, host: str, tracking_id: t.Optional[str] = None
+ ) -> NodeReferenceCount:
+ """Directly increment the reference count of a given node and ensure the
+ ref counter is marked as dirty to trigger a reordering on retrieval
+
+ :param host: a hostname that should have a reference counter selected
+ :param tracking_id: a unique task identifier executing on the node
+ to add"""
+ with self._lock:
+ tracked_node = self._nodes[host]
+ tracked_node.add(tracking_id)
+ return tracked_node
+
+ def _heapify_all_refs(self) -> t.List[_TrackedNode]:
+ """Combine the CPU and GPU nodes into a single heap
+
+ :returns: list of all reference counters"""
+ refs = [*self._cpu_refs, *self._gpu_refs]
+ heapq.heapify(refs)
+ return refs
+
+ def get_tracking_info(self, host: str) -> NodeReferenceCount:
+ """Returns the reference counter information for a single node
+
+ :param host: a hostname that should have a reference counter selected
+ :returns: a reference counter for the node
+ :raises ValueError: if the hostname is not in the set of managed nodes"""
+ if host not in self._nodes:
+ raise ValueError("The supplied hostname was not found")
+
+ return self._nodes[host]
+
+ def decrement(
+ self, host: str, tracking_id: t.Optional[str] = None
+ ) -> NodeReferenceCount:
+ """Directly decrement the reference count of a given node and ensure the
+ ref counter is marked as dirty to trigger a reordering
+
+ :param host: a hostname that should have a reference counter decremented
+ :param tracking_id: unique task identifier to remove"""
+ with self._lock:
+ tracked_node = self._nodes[host]
+ tracked_node.remove(tracking_id)
+
+ return tracked_node
+
+ def _create_sub_heap(
+ self,
+ hosts: t.Optional[t.List[str]] = None,
+ filter_on: t.Optional[PrioritizerFilter] = None,
+ ) -> t.List[_TrackedNode]:
+ """Create a new heap from the primary heap with user-specified nodes
+
+ :param hosts: a list of hostnames used to filter the available nodes
+ :param filter_on: the subset of nodes (CPU or GPU) to draw from
+ :returns: a heap of reference counters for the selected nodes
+ """
+ nodes_tracking_info: t.List[_TrackedNode] = []
+ heap = self._get_filtered_heap(filter_on)
+
+ # Collect all the tracking info for the requested nodes...
+ for node in heap:
+ if not hosts or node.hostname in hosts:
+ nodes_tracking_info.append(node)
+
+ # ... and use it to create a new heap from a specified subset of nodes
+ heapq.heapify(nodes_tracking_info)
+
+ return nodes_tracking_info
+
+ def unassigned(
+ self, heap: t.Optional[t.List[_TrackedNode]] = None
+ ) -> t.Sequence[Node]:
+ """Select nodes that are currently not assigned a task
+
+ :param heap: a subset of the node heap to consider
+ :returns: a list of reference counts for all unassigned nodes"""
+ if heap is None:
+ heap = list(self._nodes.values())
+
+ nodes: t.List[_TrackedNode] = []
+ for item in heap:
+ if item.num_refs == 0:
+ nodes.append(item)
+ return nodes
+
+ def assigned(
+ self, heap: t.Optional[t.List[_TrackedNode]] = None
+ ) -> t.Sequence[Node]:
+ """Helper method to identify the nodes that are currently assigned
+
+ :param heap: a subset of the node heap to consider
+ :returns: a list of reference counts for all assigned nodes"""
+ if heap is None:
+ heap = list(self._nodes.values())
+
+ nodes: t.List[_TrackedNode] = []
+ for item in heap:
+ if item.num_refs > 0:
+ nodes.append(item)
+ return nodes
+
+ def _check_satisfiable_n(
+ self, num_items: int, heap: t.Optional[t.List[_TrackedNode]] = None
+ ) -> bool:
+ """Validates that a request for some number of nodes `n` can be
+ satisfied by the prioritizer given the set of nodes available
+
+ :param num_items: the desired number of nodes to allocate
+ :param heap: a subset of the node heap to consider
+ :returns: True if the request can be fulfilled, False otherwise"""
+ num_nodes = len(self._nodes.keys())
+
+ if num_items < 1:
+ msg = "Cannot handle request; request requires a positive integer"
+ logger.warning(msg)
+ return False
+
+ if num_nodes < num_items:
+ msg = f"Cannot satisfy request for {num_items} nodes; {num_nodes} in pool"
+ logger.warning(msg)
+ return False
+
+ num_open = len(self.unassigned(heap))
+ if num_open < num_items:
+ msg = f"Cannot satisfy request for {num_items} nodes; {num_open} available"
+ logger.warning(msg)
+ return False
+
+ return True
+
+ def _get_next_unassigned_node(
+ self,
+ heap: t.List[_TrackedNode],
+ tracking_id: t.Optional[str] = None,
+ ) -> t.Optional[Node]:
+ """Finds the next node with no running processes and
+ ensures that any elements that were directly updated are updated in
+ the priority structure before being made available
+
+ :param heap: a subset of the node heap to consider
+ :param tracking_id: unique task identifier to track
+ :returns: a reference counter for an available node if an unassigned node
+ exists, `None` otherwise"""
+ tracking_info: t.Optional[_TrackedNode] = None
+
+ with self._lock:
+ # re-sort the heap to handle any tracking changes
+ if any(node.is_dirty for node in heap):
+ heapq.heapify(heap)
+
+ # grab the min node from the heap
+ tracking_info = heapq.heappop(heap)
+
+ # the node is available if it has no assigned tasks
+ is_assigned = tracking_info.is_assigned
+ if not is_assigned:
+ # track the new process on the node
+ tracking_info.add(tracking_id)
+
+ # add the node that was popped back into the heap
+ heapq.heappush(heap, tracking_info)
+
+ # mark all nodes as clean now that everything is updated & sorted
+ for node in heap:
+ node.clean()
+
+ # next available must only return previously unassigned nodes
+ if is_assigned:
+ return None
+
+ return tracking_info
+
+ def _get_next_n_available_nodes(
+ self,
+ num_items: int,
+ heap: t.List[_TrackedNode],
+ tracking_id: t.Optional[str] = None,
+ ) -> t.List[Node]:
+ """Find the next N available nodes w/least amount of references using
+ the supplied filter to target a specific node capability
+
+ :param num_items: number of nodes to reserve
+ :param heap: a subset of the node heap to consider
+ :param tracking_id: unique task identifier to track
+ :returns: a list of reference counters for available nodes; an empty
+ list if too few unassigned nodes exist
+ :raises ValueError: if the number of requested nodes is not a positive integer
+ """
+ next_nodes: t.List[Node] = []
+
+ if num_items < 1:
+ raise ValueError(f"Number of items requested {num_items} is invalid")
+
+ if not self._check_satisfiable_n(num_items, heap):
+ return next_nodes
+
+ while len(next_nodes) < num_items:
+ if next_node := self._get_next_unassigned_node(heap, tracking_id):
+ next_nodes.append(next_node)
+ continue
+ break
+
+ return next_nodes
+
+ def _get_filtered_heap(
+ self, filter_on: t.Optional[PrioritizerFilter] = None
+ ) -> t.List[_TrackedNode]:
+ """Helper method to select the set of nodes to include in a filtered
+ heap.
+
+ :param filter_on: The filter identifying the subset of nodes to select. If no
+ filter is supplied, all nodes are returned
+ :returns: the tracked nodes matching the filter"""
+ if filter_on == PrioritizerFilter.GPU:
+ return self._gpu_refs
+ if filter_on == PrioritizerFilter.CPU:
+ return self._cpu_refs
+
+ return self._heapify_all_refs()
+
+ def next(
+ self,
+ filter_on: t.Optional[PrioritizerFilter] = None,
+ tracking_id: t.Optional[str] = None,
+ hosts: t.Optional[t.List[str]] = None,
+ ) -> t.Optional[Node]:
+ """Find the next unsassigned node using the supplied filter to target
+ a specific node capability
+
+ :param filter_on: the subset of nodes to query for available nodes
+ :param tracking_id: unique task identifier to track
+ :param hosts: a list of hostnames used to filter the available nodes
+ :returns: a reference counter for an available node if an unassigned node
+ exists, `None` otherwise"""
+ if results := self.next_n(1, filter_on, tracking_id, hosts):
+ return results[0]
+ return None
+
+ def next_n(
+ self,
+ num_items: int = 1,
+ filter_on: t.Optional[PrioritizerFilter] = None,
+ tracking_id: t.Optional[str] = None,
+ hosts: t.Optional[t.List[str]] = None,
+ ) -> t.List[Node]:
+ """Find the next N available nodes w/least amount of references using
+ the supplied filter to target a specific node capability
+
+ :param num_items: number of nodes to reserve
+ :param filter_on: the subset of nodes to query for available nodes
+ :param tracking_id: unique task identifier to track
+ :param hosts: a list of hostnames used to filter the available nodes
+ :returns: Collection of reserved nodes
+ :raises ValueError: if the number of requested nodes is not a positive integer"""
+ heap = self._create_sub_heap(hosts, filter_on)
+ return self._get_next_n_available_nodes(num_items, heap, tracking_id)
diff --git a/smartsim/_core/launcher/step/alps_step.py b/smartsim/_core/launcher/step/alps_step.py
index 047e75d2cf..dc9f3bff61 100644
--- a/smartsim/_core/launcher/step/alps_step.py
+++ b/smartsim/_core/launcher/step/alps_step.py
@@ -126,14 +126,14 @@ def _build_exe(self) -> t.List[str]:
return self._make_mpmd()
exe = self.entity.exe
- args = self.entity.exe_args # pylint: disable=protected-access
+ args = self.entity.exe_args
return exe + args
def _make_mpmd(self) -> t.List[str]:
"""Build Aprun (MPMD) executable"""
exe = self.entity.exe
- exe_args = self.entity._exe_args # pylint: disable=protected-access
+ exe_args = self.entity.exe_args
cmd = exe + exe_args
for mpmd in self._get_mpmd():
diff --git a/smartsim/_core/launcher/step/dragon_step.py b/smartsim/_core/launcher/step/dragon_step.py
index 63e9f65fe8..f1e8662e2a 100644
--- a/smartsim/_core/launcher/step/dragon_step.py
+++ b/smartsim/_core/launcher/step/dragon_step.py
@@ -170,6 +170,7 @@ def _write_request_file(self) -> str:
env = run_settings.env_vars
nodes = int(run_args.get("nodes", None) or 1)
tasks_per_node = int(run_args.get("tasks-per-node", None) or 1)
+ hosts_csv = run_args.get("host-list", None)
policy = DragonRunPolicy.from_run_args(run_args)
@@ -188,6 +189,7 @@ def _write_request_file(self) -> str:
output_file=out,
error_file=err,
policy=policy,
+ hostlist=hosts_csv,
)
requests.append(request_registry.to_string(request))
with open(request_file, "w", encoding="utf-8") as script_file:
diff --git a/smartsim/_core/launcher/step/lsf_step.py b/smartsim/_core/launcher/step/lsf_step.py
index 372e21c81b..80583129c1 100644
--- a/smartsim/_core/launcher/step/lsf_step.py
+++ b/smartsim/_core/launcher/step/lsf_step.py
@@ -217,7 +217,7 @@ def _build_exe(self) -> t.List[str]:
:return: executable list
"""
exe = self.entity.exe
- args = self.entity.exe_args # pylint: disable=protected-access
+ args = self.entity.exe_args
if self._get_mpmd():
erf_file = self.get_step_file(ending=".mpmd")
diff --git a/smartsim/_core/launcher/step/mpi_step.py b/smartsim/_core/launcher/step/mpi_step.py
index 06a94cd4cc..0eb2f34fdb 100644
--- a/smartsim/_core/launcher/step/mpi_step.py
+++ b/smartsim/_core/launcher/step/mpi_step.py
@@ -136,13 +136,13 @@ def _build_exe(self) -> t.List[str]:
return self._make_mpmd()
exe = self.entity.exe
- args = self.entity.exe_args # pylint: disable=protected-access
+ args = self.entity.exe_args
return exe + args
def _make_mpmd(self) -> t.List[str]:
"""Build mpiexec (MPMD) executable"""
exe = self.entity.exe
- args = self.entity.exe_args # pylint: disable=protected-access
+ args = self.entity.exe_args
cmd = exe + args
for mpmd in self._get_mpmd():
@@ -150,7 +150,7 @@ def _make_mpmd(self) -> t.List[str]:
cmd += mpmd.format_run_args()
cmd += mpmd.format_env_vars()
cmd += mpmd.exe
- cmd += mpmd.exe_args # pylint: disable=protected-access
+ cmd += mpmd.exe_args
cmd = sh_split(" ".join(cmd))
return cmd
diff --git a/smartsim/_core/launcher/step/slurm_step.py b/smartsim/_core/launcher/step/slurm_step.py
index af042dfc18..410d14d269 100644
--- a/smartsim/_core/launcher/step/slurm_step.py
+++ b/smartsim/_core/launcher/step/slurm_step.py
@@ -211,8 +211,9 @@ def _build_exe(self) -> t.List[str]:
return exe + args
# There is an issue here, exe and exe_args are no longer attached to the
- # runsettings. This functions is looping through the list of run_settings.mpmd
- # and build the variable cmd
+ # runsettings.
+ # This function loops through the list of run_settings.mpmd and
+ # builds the variable cmd
def _make_mpmd(self) -> t.List[str]:
"""Build Slurm multi-prog (MPMD) executable"""
exe = self.entity.exe
diff --git a/smartsim/_core/mli/__init__.py b/smartsim/_core/mli/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/smartsim/_core/mli/client/__init__.py b/smartsim/_core/mli/client/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/smartsim/_core/mli/client/protoclient.py b/smartsim/_core/mli/client/protoclient.py
new file mode 100644
index 0000000000..46598a8171
--- /dev/null
+++ b/smartsim/_core/mli/client/protoclient.py
@@ -0,0 +1,348 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# isort: off
+# pylint: disable=unused-import,import-error
+import dragon
+import dragon.channels
+from dragon.globalservices.api_setup import connect_to_infrastructure
+
+try:
+ from mpi4py import MPI # type: ignore[import-not-found]
+except Exception:
+ MPI = None
+ print("Unable to import `mpi4py` package")
+
+# isort: on
+# pylint: enable=unused-import,import-error
+
+import numbers
+import os
+import time
+import typing as t
+from collections import OrderedDict
+
+import numpy
+import torch
+
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster
+from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim._core.mli.message_handler import MessageHandler
+from smartsim._core.utils.timings import PerfTimer
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+_TimingDict = OrderedDict[str, list[str]]
+
+
+logger = get_logger("App")
+logger.info("Started app")
+CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False
+
+
+class ProtoClient:
+ """Proof of concept implementation of a client enabling user applications
+ to interact with MLI resources."""
+
+ _DEFAULT_BACKBONE_TIMEOUT = 1.0
+ """A default timeout period applied to connection attempts with the
+ backbone feature store."""
+
+ _DEFAULT_WORK_QUEUE_SIZE = 500
+ """A default number of events to be buffered in the work queue before
+ triggering QueueFull exceptions."""
+
+ _EVENT_SOURCE = "proto-client"
+ """A user-friendly name for this class instance to identify
+ the client as the publisher of an event."""
+
+ @staticmethod
+ def _attach_to_backbone() -> BackboneFeatureStore:
+ """Use the supplied environment variables to attach
+ to a pre-existing backbone featurestore. Requires the
+ `_SMARTSIM_INFRA_BACKBONE` environment variable to be set.
+
+ :returns: The attached backbone featurestore
+ :raises SmartSimError: If the backbone descriptor is not contained
+ in the appropriate environment variable
+ """
+ descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None)
+ if not descriptor:
+ raise SmartSimError(
+ "Missing required backbone configuration in environment: "
+ f"{BackboneFeatureStore.MLI_BACKBONE}"
+ )
+
+ backbone = t.cast(
+ BackboneFeatureStore, BackboneFeatureStore.from_descriptor(descriptor)
+ )
+ return backbone
+
+ def _attach_to_worker_queue(self) -> DragonFLIChannel:
+ """Wait until the backbone contains the worker queue configuration,
+ then attach an FLI to the given worker queue.
+
+ :returns: The attached FLI channel
+ :raises SmartSimError: if the required configuration is not found in the
+ backbone feature store
+ """
+
+ descriptor = ""
+ try:
+ # NOTE: without wait_for, this MUST be in the backbone....
+ config = self._backbone.wait_for(
+ [BackboneFeatureStore.MLI_WORKER_QUEUE], self.backbone_timeout
+ )
+ descriptor = str(config[BackboneFeatureStore.MLI_WORKER_QUEUE])
+ except Exception as ex:
+ logger.info(
+ f"Unable to retrieve {BackboneFeatureStore.MLI_WORKER_QUEUE} "
+ "to attach to the worker queue."
+ )
+ raise SmartSimError("Unable to locate worker queue using backbone") from ex
+
+ return DragonFLIChannel.from_descriptor(descriptor)
+
+ def _create_broadcaster(self) -> EventBroadcaster:
+ """Create an EventBroadcaster that broadcasts events to
+ all MLI components registered to consume them.
+
+ :returns: An EventBroadcaster instance
+ """
+ broadcaster = EventBroadcaster(
+ self._backbone, DragonCommChannel.from_descriptor
+ )
+ return broadcaster
+
+ def __init__(
+ self,
+ timing_on: bool,
+ backbone_timeout: float = _DEFAULT_BACKBONE_TIMEOUT,
+ ) -> None:
+ """Initialize the client instance.
+
+ :param timing_on: Flag indicating if timing information should be
+ written to file
+ :param backbone_timeout: Maximum wait time (in seconds) allowed to attach to the
+ worker queue
+ :raises SmartSimError: If unable to attach to a backbone featurestore
+ :raises ValueError: If an invalid backbone timeout is specified
+ """
+ if MPI is not None:
+ # TODO: determine a way to make MPI work in the test environment
+ # - consider catching the import exception and defaulting rank to 0
+ comm = MPI.COMM_WORLD
+ rank: int = comm.Get_rank()
+ else:
+ rank = 0
+
+ if backbone_timeout <= 0:
+ raise ValueError(
+ f"Invalid backbone timeout provided: {backbone_timeout}. "
+ "The value must be greater than zero."
+ )
+ self._backbone_timeout = max(backbone_timeout, 0.1)
+
+ connect_to_infrastructure()
+
+ self._backbone = self._attach_to_backbone()
+ self._backbone.wait_timeout = self.backbone_timeout
+ self._to_worker_fli = self._attach_to_worker_queue()
+
+ self._from_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE)
+ self._to_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE)
+
+ self._publisher = self._create_broadcaster()
+
+ self.perf_timer: PerfTimer = PerfTimer(
+ debug=False, timing_on=timing_on, prefix=f"a{rank}_"
+ )
+ self._start: t.Optional[float] = None
+ self._interm: t.Optional[float] = None
+ self._timings: _TimingDict = OrderedDict()
+ self._timing_on = timing_on
+
+ @property
+ def backbone_timeout(self) -> float:
+ """The timeout (in seconds) applied to retrievals
+ from the backbone feature store.
+
+ :returns: A float indicating the number of seconds to allow"""
+ return self._backbone_timeout
+
+ def _add_label_to_timings(self, label: str) -> None:
+ """Adds a new label into the timing dictionary to prepare for
+ receiving timing events.
+
+ :param label: The label to create storage for
+ """
+ if label not in self._timings:
+ self._timings[label] = []
+
+ @staticmethod
+ def _format_number(number: t.Union[numbers.Number, float]) -> str:
+ """Utility function for formatting numbers consistently for logs.
+
+ :param number: The number to convert to a formatted string
+ :returns: The formatted string containing the number
+ """
+ return f"{number:0.4e}"
+
+ def start_timings(self, batch_size: numbers.Number) -> None:
+ """Configure the client to begin storing timing information.
+
+ :param batch_size: The size of batches to generate as inputs
+ to the model
+ """
+ if self._timing_on:
+ self._add_label_to_timings("batch_size")
+ self._timings["batch_size"].append(self._format_number(batch_size))
+ self._start = time.perf_counter()
+ self._interm = time.perf_counter()
+
+ def end_timings(self) -> None:
+ """Configure the client to stop storing timing information."""
+ if self._timing_on and self._start is not None:
+ self._add_label_to_timings("total_time")
+ self._timings["total_time"].append(
+ self._format_number(time.perf_counter() - self._start)
+ )
+
+ def measure_time(self, label: str) -> None:
+ """Measures elapsed time since the last recorded signal.
+
+ :param label: The label to measure time for
+ """
+ if self._timing_on and self._interm is not None:
+ self._add_label_to_timings(label)
+ self._timings[label].append(
+ self._format_number(time.perf_counter() - self._interm)
+ )
+ self._interm = time.perf_counter()
+
+ def print_timings(self, to_file: bool = False) -> None:
+ """Print timing information to standard output. If `to_file`
+ is `True`, also write results to a file.
+
+ :param to_file: If `True`, also saves timing information
+ to the files `timings.npy` and `timings.txt`
+ """
+ print(" ".join(self._timings.keys()))
+
+ value_array = numpy.array(list(self._timings.values()), dtype=float)
+ value_array = numpy.transpose(value_array)
+ for i in range(value_array.shape[0]):
+ print(" ".join(self._format_number(value) for value in value_array[i]))
+ if to_file:
+ numpy.save("timings.npy", value_array)
+ numpy.savetxt("timings.txt", value_array)
+
+ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any:
+ """Execute a batch of inference requests with the supplied ML model.
+
+ :param model: The raw bytes or path to a pytorch model
+ :param batch: The tensor batch to perform inference on
+ :returns: The inference results
+ :raises ValueError: if the worker queue is not configured properly
+ in the environment variables
+ """
+ tensors = [batch.numpy()]
+ self.perf_timer.start_timings("batch_size", batch.shape[0])
+ built_tensor_desc = MessageHandler.build_tensor_descriptor(
+ "c", "float32", list(batch.shape)
+ )
+ self.perf_timer.measure_time("build_tensor_descriptor")
+ if isinstance(model, str):
+ model_arg = MessageHandler.build_model_key(model, self._backbone.descriptor)
+ else:
+ model_arg = MessageHandler.build_model(
+ model, "resnet-50", "1.0"
+ ) # type: ignore
+ request = MessageHandler.build_request(
+ reply_channel=self._from_worker_ch.descriptor,
+ model=model_arg,
+ inputs=[built_tensor_desc],
+ outputs=[],
+ output_descriptors=[],
+ custom_attributes=None,
+ )
+ self.perf_timer.measure_time("build_request")
+ request_bytes = MessageHandler.serialize_request(request)
+ self.perf_timer.measure_time("serialize_request")
+
+ if self._to_worker_fli is None:
+ raise ValueError("No worker queue available.")
+
+ # pylint: disable-next=protected-access
+ with self._to_worker_fli._channel.sendh( # type: ignore
+ timeout=None,
+ stream_channel=self._to_worker_ch.channel,
+ ) as to_sendh:
+ to_sendh.send_bytes(request_bytes)
+ self.perf_timer.measure_time("send_request")
+ for tensor in tensors:
+ to_sendh.send_bytes(tensor.tobytes()) # TODO NOT FAST ENOUGH!!!
+ logger.info(f"Message size: {len(request_bytes)} bytes")
+
+ self.perf_timer.measure_time("send_tensors")
+ with self._from_worker_ch.channel.recvh(timeout=None) as from_recvh:
+ resp = from_recvh.recv_bytes(timeout=None)
+ self.perf_timer.measure_time("receive_response")
+ response = MessageHandler.deserialize_response(resp)
+ self.perf_timer.measure_time("deserialize_response")
+
+ # recv depending on the len(response.result.descriptors)?
+ data_blob: bytes = from_recvh.recv_bytes(timeout=None)
+ self.perf_timer.measure_time("receive_tensor")
+ result = torch.from_numpy(
+ numpy.frombuffer(
+ data_blob,
+ dtype=str(response.result.descriptors[0].dataType),
+ )
+ )
+ self.perf_timer.measure_time("deserialize_tensor")
+
+ self.perf_timer.end_timings()
+ return result
+
+ def set_model(self, key: str, model: bytes) -> None:
+ """Write the supplied model to the feature store.
+
+ :param key: The unique key used to identify the model
+ :param model: The raw bytes of the model to execute
+ """
+ self._backbone[key] = model
+
+ # notify components of a change in the data at this key
+ event = OnWriteFeatureStore(self._EVENT_SOURCE, self._backbone.descriptor, key)
+ self._publisher.send(event)
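+
+ # Illustrative usage sketch (comment only, not executed): assumes a running
+ # Dragon-backed MLI deployment that has exported the backbone descriptor to
+ # the environment before this process starts; `model_bytes` is a placeholder
+ # for a serialized model.
+ #
+ #     client = ProtoClient(timing_on=True)
+ #     client.set_model("my-model", model_bytes)
+ #     output = client.run_model("my-model", torch.rand(1, 3, 224, 224))
+ #     client.print_timings(to_file=True)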
diff --git a/smartsim/_core/mli/comm/channel/__init__.py b/smartsim/_core/mli/comm/channel/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py
new file mode 100644
index 0000000000..104333ce7f
--- /dev/null
+++ b/smartsim/_core/mli/comm/channel/channel.py
@@ -0,0 +1,82 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import base64
+import typing as t
+import uuid
+from abc import ABC, abstractmethod
+
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class CommChannelBase(ABC):
+ """Base class for abstracting a message passing mechanism"""
+
+ def __init__(
+ self,
+ descriptor: str,
+ name: t.Optional[str] = None,
+ ) -> None:
+ """Initialize the CommChannel instance.
+
+ :param descriptor: Channel descriptor
+ """
+ self._descriptor = descriptor
+ """An opaque identifier used to connect to an underlying communication channel"""
+ self._name = name or str(uuid.uuid4())
+ """A user-friendly identifier for channel-related logging"""
+
+ @abstractmethod
+ def send(self, value: bytes, timeout: float = 0.001) -> None:
+ """Send a message through the underlying communication channel.
+
+ :param value: The value to send
+ :param timeout: Maximum time to wait (in seconds) for messages to send
+ :raises SmartSimError: If sending message fails
+ """
+
+ @abstractmethod
+ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
+ """Receives message(s) through the underlying communication channel.
+
+ :param timeout: Maximum time to wait (in seconds) for messages to arrive
+ :returns: The received message
+ """
+
+ @property
+ def descriptor(self) -> str:
+ """Return the channel descriptor for the underlying dragon channel.
+
+ :returns: Byte encoded channel descriptor
+ """
+ return self._descriptor
+
+ def __str__(self) -> str:
+ """Build a string representation of the channel useful for printing."""
+ classname = type(self).__name__
+ return f"{classname}('{self._name}', '{self._descriptor}')"
diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py
new file mode 100644
index 0000000000..110f19258a
--- /dev/null
+++ b/smartsim/_core/mli/comm/channel/dragon_channel.py
@@ -0,0 +1,127 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+
+import dragon.channels as dch
+
+import smartsim._core.mli.comm.channel.channel as cch
+import smartsim._core.mli.comm.channel.dragon_util as drg_util
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class DragonCommChannel(cch.CommChannelBase):
+ """Passes messages by writing to a Dragon channel."""
+
+ def __init__(self, channel: "dch.Channel") -> None:
+ """Initialize the DragonCommChannel instance.
+
+ :param channel: A channel to use for communications
+ """
+ descriptor = drg_util.channel_to_descriptor(channel)
+ super().__init__(descriptor)
+ self._channel = channel
+ """The underlying dragon channel used by this CommChannel for communications"""
+
+ @property
+ def channel(self) -> "dch.Channel":
+ """The underlying communication channel.
+
+ :returns: The channel
+ """
+ return self._channel
+
+ def send(self, value: bytes, timeout: float = 0.001) -> None:
+ """Send a message through the underlying communication channel.
+
+ :param value: The value to send
+ :param timeout: Maximum time to wait (in seconds) for messages to send
+ :raises SmartSimError: If sending message fails
+ """
+ try:
+ with self._channel.sendh(timeout=timeout) as sendh:
+ sendh.send_bytes(value, blocking=False)
+ logger.debug(f"DragonCommChannel {self.descriptor} sent message")
+ except Exception as e:
+ raise SmartSimError(
+ f"Error sending via DragonCommChannel {self.descriptor}"
+ ) from e
+
+ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
+ """Receives message(s) through the underlying communication channel.
+
+ :param timeout: Maximum time to wait (in seconds) for messages to arrive
+ :returns: The received message(s)
+ """
+ with self._channel.recvh(timeout=timeout) as recvh:
+ messages: t.List[bytes] = []
+
+ try:
+ message_bytes = recvh.recv_bytes(timeout=timeout)
+ messages.append(message_bytes)
+ logger.debug(f"DragonCommChannel {self.descriptor} received message")
+ except dch.ChannelEmpty:
+ # emptied the queue, ok to swallow this ex
+ logger.debug(f"DragonCommChannel exhausted: {self.descriptor}")
+ except dch.ChannelRecvTimeout:
+ logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor}")
+
+ return messages
+
+ @classmethod
+ def from_descriptor(
+ cls,
+ descriptor: str,
+ ) -> "DragonCommChannel":
+ """A factory method that creates an instance from a descriptor string.
+
+ :param descriptor: The descriptor that uniquely identifies the resource.
+ :returns: An attached DragonCommChannel
+ :raises SmartSimError: If creation of comm channel fails
+ """
+ try:
+ channel = drg_util.descriptor_to_channel(descriptor)
+ return DragonCommChannel(channel)
+ except Exception as ex:
+ raise SmartSimError(
+ f"Failed to create dragon comm channel: {descriptor}"
+ ) from ex
+
+ @classmethod
+ def from_local(cls, _descriptor: t.Optional[str] = None) -> "DragonCommChannel":
+ """A factory method that creates a local channel instance.
+
+ :param _descriptor: Unused placeholder
+ :returns: An attached DragonCommChannel"""
+ try:
+ channel = drg_util.create_local()
+ return DragonCommChannel(channel)
+ except Exception:
+ logger.error("Failed to create local dragon comm channel", exc_info=True)
+ raise
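+
+ # Illustrative round trip (comment only): assumes a running Dragon runtime.
+ # A descriptor produced in one process can be used by a peer to attach to
+ # the same underlying channel.
+ #
+ #     channel = DragonCommChannel.from_local()
+ #     peer = DragonCommChannel.from_descriptor(channel.descriptor)
+ #     peer.send(b"hello", timeout=0.5)
+ #     messages = channel.recv(timeout=0.5)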
diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py
new file mode 100644
index 0000000000..01849247cd
--- /dev/null
+++ b/smartsim/_core/mli/comm/channel/dragon_fli.py
@@ -0,0 +1,160 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# isort: off
+
+import dragon
+import dragon.fli as fli
+from dragon.channels import Channel
+
+# isort: on
+
+import typing as t
+
+import smartsim._core.mli.comm.channel.channel as cch
+import smartsim._core.mli.comm.channel.dragon_util as drg_util
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class DragonFLIChannel(cch.CommChannelBase):
+ """Passes messages by writing to a Dragon FLI Channel."""
+
+ def __init__(
+ self,
+ fli_: fli.FLInterface,
+ buffer_size: int = drg_util.DEFAULT_CHANNEL_BUFFER_SIZE,
+ ) -> None:
+ """Initialize the DragonFLIChannel instance.
+
+ :param fli_: The FLInterface to use as the underlying communications channel
+ :param buffer_size: Maximum number of sent messages that can be buffered
+ """
+ descriptor = drg_util.channel_to_descriptor(fli_)
+ super().__init__(descriptor)
+
+ self._channel: t.Optional["Channel"] = None
+ """The underlying dragon Channel used by a sender-side DragonFLIChannel
+ to attach to the main FLI channel"""
+
+ self._fli = fli_
+ """The underlying dragon FLInterface used by this CommChannel for communications"""
+ self._buffer_size: int = buffer_size
+ """Maximum number of messages that can be buffered before sending"""
+
+ def send(self, value: bytes, timeout: float = 0.001) -> None:
+ """Send a message through the underlying communication channel.
+
+ :param value: The value to send
+ :param timeout: Maximum time to wait (in seconds) for messages to send
+ :raises SmartSimError: If sending message fails
+ """
+ try:
+ if self._channel is None:
+ self._channel = drg_util.create_local(self._buffer_size)
+
+ with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh:
+ sendh.send_bytes(value, timeout=timeout)
+ logger.debug(f"DragonFLIChannel {self.descriptor} sent message")
+ except Exception as e:
+ self._channel = None
+ raise SmartSimError(
+ f"Error sending via DragonFLIChannel {self.descriptor}"
+ ) from e
+
+ def send_multiple(
+ self,
+ values: t.Sequence[bytes],
+ timeout: float = 0.001,
+ ) -> None:
+ """Send a message through the underlying communication channel.
+
+ :param values: The values to send
+ :param timeout: Maximum time to wait (in seconds) for messages to send
+ :raises SmartSimError: If sending message fails
+ """
+ try:
+ if self._channel is None:
+ self._channel = drg_util.create_local(self._buffer_size)
+
+ with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh:
+ for value in values:
+ sendh.send_bytes(value)
+ logger.debug(f"DragonFLIChannel {self.descriptor} sent message")
+ except Exception as e:
+ self._channel = None
+ raise SmartSimError(
+ f"Error sending via DragonFLIChannel {self.descriptor} {e}"
+ ) from e
+
+ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
+ """Receives message(s) through the underlying communication channel.
+
+ :param timeout: Maximum time to wait (in seconds) for messages to arrive
+ :returns: The received message(s)
+ :raises SmartSimError: If receiving message(s) fails
+ """
+ messages = []
+ eot = False
+ with self._fli.recvh(timeout=timeout) as recvh:
+ while not eot:
+ try:
+ message, _ = recvh.recv_bytes(timeout=timeout)
+ messages.append(message)
+ logger.debug(f"DragonFLIChannel {self.descriptor} received message")
+ except fli.FLIEOT:
+ eot = True
+ logger.debug(f"DragonFLIChannel exhausted: {self.descriptor}")
+ except Exception as e:
+ raise SmartSimError(
+ f"Error receiving messages: DragonFLIChannel {self.descriptor}"
+ ) from e
+ return messages
+
+ @classmethod
+ def from_descriptor(
+ cls,
+ descriptor: str,
+ ) -> "DragonFLIChannel":
+ """A factory method that creates an instance from a descriptor string.
+
+ :param descriptor: The descriptor that uniquely identifies the resource
+ :returns: An attached DragonFLIChannel
+ :raises SmartSimError: If creation of DragonFLIChannel fails
+ :raises ValueError: If the descriptor is invalid
+ """
+ if not descriptor:
+ raise ValueError("Invalid descriptor provided")
+
+ try:
+ return DragonFLIChannel(fli_=drg_util.descriptor_to_fli(descriptor))
+ except Exception as e:
+ raise SmartSimError(
+ f"Error while creating DragonFLIChannel: {descriptor}"
+ ) from e
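+
+ # Illustrative usage (comment only): assumes a running Dragon runtime and an
+ # FLI descriptor published elsewhere (e.g. the MLI worker queue); the
+ # `fli_descriptor`, `request_bytes`, and `tensor_bytes` names are placeholders.
+ #
+ #     channel = DragonFLIChannel.from_descriptor(fli_descriptor)
+ #     channel.send_multiple([request_bytes, tensor_bytes], timeout=0.5)
+ #     replies = channel.recv(timeout=1.0)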
diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py
new file mode 100644
index 0000000000..8517979ec4
--- /dev/null
+++ b/smartsim/_core/mli/comm/channel/dragon_util.py
@@ -0,0 +1,131 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import base64
+import binascii
+import typing as t
+
+import dragon.channels as dch
+import dragon.fli as fli
+import dragon.managed_memory as dm
+
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+DEFAULT_CHANNEL_BUFFER_SIZE = 500
+"""Maximum number of messages that can be buffered. DragonCommChannel will
+raise an exception if no clients consume messages before the buffer is filled."""
+
+LAST_OFFSET = 0
+"""The last offset used to create a local channel. This is used to avoid
+unnecessary retries when creating a local channel."""
+
+
+def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str:
+ """Convert a dragon channel to a descriptor string.
+
+ :param channel: The dragon channel to convert
+ :returns: The descriptor string
+ :raises ValueError: If a dragon channel is not provided
+ """
+ if channel is None:
+ raise ValueError("Channel is not available to create a descriptor")
+
+ serialized_ch = channel.serialize()
+ return base64.b64encode(serialized_ch).decode("utf-8")
+
+
+def pool_to_descriptor(pool: dm.MemoryPool) -> str:
+ """Convert a dragon memory pool to a descriptor string.
+
+ :param pool: The memory pool to convert
+ :returns: The descriptor string
+ :raises ValueError: If a memory pool is not provided
+ """
+ if pool is None:
+ raise ValueError("Memory pool is not available to create a descriptor")
+
+ serialized_pool = pool.serialize()
+ return base64.b64encode(serialized_pool).decode("utf-8")
+
+
+def descriptor_to_fli(descriptor: str) -> "fli.FLInterface":
+ """Create and attach a new FLI instance given
+ the string-encoded descriptor.
+
+ :param descriptor: The descriptor of an FLI to attach to
+ :returns: The attached dragon FLI
+ :raises ValueError: If the descriptor is empty or incorrectly formatted
+ :raises SmartSimError: If attachment using the descriptor fails
+ """
+ if len(descriptor) < 1:
+ raise ValueError("Descriptors may not be empty")
+
+ try:
+ encoded = descriptor.encode("utf-8")
+ descriptor_ = base64.b64decode(encoded)
+ return fli.FLInterface.attach(descriptor_)
+ except binascii.Error as ex:
+ raise ValueError("The descriptor was not properly base64 encoded") from ex
+ except fli.DragonFLIError as ex:
+ raise SmartSimError("The descriptor did not address an available FLI") from ex
+
+
+def descriptor_to_channel(descriptor: str) -> dch.Channel:
+ """Create and attach a new Channel instance given
+ the string-encoded descriptor.
+
+ :param descriptor: The descriptor of a channel to attach to
+ :returns: The attached dragon Channel
+ :raises ValueError: If the descriptor is empty or incorrectly formatted
+ :raises SmartSimError: If attachment using the descriptor fails
+ """
+ if len(descriptor) < 1:
+ raise ValueError("Descriptors may not be empty")
+
+ try:
+ encoded = descriptor.encode("utf-8")
+ descriptor_ = base64.b64decode(encoded)
+ return dch.Channel.attach(descriptor_)
+ except binascii.Error as ex:
+ raise ValueError("The descriptor was not properly base64 encoded") from ex
+ except dch.ChannelError as ex:
+ raise SmartSimError("The descriptor did not address an available channel") from ex
+
+
+def create_local(_capacity: int = 0) -> dch.Channel:
+ """Creates a Channel attached to the local memory pool. Replacement for
+ direct calls to `dch.Channel.make_process_local()` to enable
+ supplying a channel capacity.
+
+ :param _capacity: The number of events the channel can buffer; uses the default
+ buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied
+ :returns: The instantiated channel
+ """
+ channel = dch.Channel.make_process_local()
+ return channel
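+
+ # Illustrative round trip (comment only): assumes a running Dragon runtime.
+ # Descriptors are base64-encoded serialized channels, so they can be passed
+ # through environment variables or feature store values and re-attached later.
+ #
+ #     channel = create_local()
+ #     descriptor = channel_to_descriptor(channel)
+ #     attached = descriptor_to_channel(descriptor)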
diff --git a/smartsim/_core/mli/infrastructure/__init__.py b/smartsim/_core/mli/infrastructure/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/smartsim/_core/mli/infrastructure/comm/__init__.py b/smartsim/_core/mli/infrastructure/comm/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/smartsim/_core/mli/infrastructure/comm/broadcaster.py b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
new file mode 100644
index 0000000000..56dcf549f7
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
@@ -0,0 +1,239 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+import uuid
+from collections import defaultdict, deque
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.infrastructure.comm.event import EventBase
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class BroadcastResult(t.NamedTuple):
+ """Contains summary details about a broadcast."""
+
+ num_sent: int
+ """The total number of messages delivered across all consumers"""
+ num_failed: int
+ """The total number of messages not delivered across all consumers"""
+
+
+class EventBroadcaster:
+ """Performs fan-out publishing of system events."""
+
+ def __init__(
+ self,
+ backbone: BackboneFeatureStore,
+ channel_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None,
+ name: t.Optional[str] = None,
+ ) -> None:
+ """Initialize the EventPublisher instance.
+
+ :param backbone: The MLI backbone feature store
+ :param channel_factory: Factory method to construct new channel instances
+ :param name: A user-friendly name for logging. If not provided, an
+ auto-generated GUID will be used
+ """
+ self._backbone = backbone
+ """The backbone feature store used to retrieve consumer descriptors"""
+ self._channel_factory = channel_factory
+ """A factory method used to instantiate channels from descriptors"""
+ self._channel_cache: t.Dict[str, t.Optional[CommChannelBase]] = defaultdict(
+ lambda: None
+ )
+ """A mapping of instantiated channels that can be re-used. Automatically
+ calls the channel factory if a descriptor is not already in the collection"""
+ self._event_buffer: t.Deque[EventBase] = deque()
+ """A buffer for storing events when a consumer list is not found"""
+ self._descriptors: t.Set[str]
+ """Stores the most recent list of broadcast consumers. Updated automatically
+ on each broadcast"""
+ self._name = name or str(uuid.uuid4())
+ """A unique identifer assigned to the broadcaster for logging"""
+
+ @property
+ def name(self) -> str:
+ """The friendly name assigned to the broadcaster.
+
+ :returns: The broadcaster name if one is assigned, otherwise a unique
+ id assigned by the system.
+ """
+ return self._name
+
+ @property
+ def num_buffered(self) -> int:
+ """Return the number of events currently buffered to send.
+
+ :returns: Number of buffered events
+ """
+ return len(self._event_buffer)
+
+ def _save_to_buffer(self, event: EventBase) -> None:
+ """Places the event in the buffer to be sent once a consumer
+ list is available.
+
+ :param event: The event to buffer
+ :raises ValueError: If the event cannot be buffered
+ """
+ try:
+ self._event_buffer.append(event)
+ logger.debug(f"Buffered event {event=}")
+ except Exception as ex:
+ raise ValueError(
+ f"Unable to buffer event {event} in broadcaster {self.name}"
+ ) from ex
+
+ def _log_broadcast_start(self) -> None:
+ """Logs broadcast statistics."""
+ num_events = len(self._event_buffer)
+ num_copies = len(self._descriptors)
+ logger.debug(
+ f"Broadcast {num_events} events to {num_copies} consumers from {self.name}"
+ )
+
+ def _prune_unused_consumers(self) -> None:
+ """Performs maintenance on the channel cache by pruning any channel
+ that has been removed from the consumers list."""
+ active_consumers = set(self._descriptors)
+ current_channels = set(self._channel_cache.keys())
+
+ # find any cached channels that are now unused
+ inactive_channels = current_channels.difference(active_consumers)
+ new_channels = active_consumers.difference(current_channels)
+
+ for descriptor in inactive_channels:
+ self._channel_cache.pop(descriptor)
+
+ logger.debug(
+ f"Pruning {len(inactive_channels)} stale consumers and"
+ f" found {len(new_channels)} new channels for {self.name}"
+ )
+
+ def _get_comm_channel(self, descriptor: str) -> CommChannelBase:
+ """Helper method to build and cache a comm channel.
+
+ :param descriptor: The descriptor to pass to the channel factory
+ :returns: The instantiated channel
+ :raises SmartSimError: If the channel fails to attach
+ """
+ comm_channel = self._channel_cache[descriptor]
+ if comm_channel is not None:
+ return comm_channel
+
+ if self._channel_factory is None:
+ raise SmartSimError("No channel factory provided for consumers")
+
+ try:
+ channel = self._channel_factory(descriptor)
+ self._channel_cache[descriptor] = channel
+ return channel
+ except Exception as ex:
+ msg = f"Unable to construct channel with descriptor: {descriptor}"
+ logger.error(msg, exc_info=True)
+ raise SmartSimError(msg) from ex
+
+ def _get_next_event(self) -> t.Optional[EventBase]:
+ """Pop the next event to be sent from the queue.
+
+ :returns: The next event to send if any events are enqueued, otherwise `None`.
+ """
+ try:
+ return self._event_buffer.popleft()
+ except IndexError:
+ logger.debug(f"Broadcast buffer exhausted for {self.name}")
+
+ return None
+
+ def _broadcast(self, timeout: float = 0.001) -> BroadcastResult:
+ """Broadcasts all buffered events to registered event consumers.
+
+ :param timeout: Maximum time to wait (in seconds) for messages to send
+ :returns: BroadcastResult containing the number of messages that were
+ successfully and unsuccessfully sent for all consumers
+ :raises SmartSimError: If the channel fails to attach or broadcasting fails
+ """
+ # allow descriptors to be empty since events are buffered
+ self._descriptors = set(x for x in self._backbone.notification_channels if x)
+ if not self._descriptors:
+ msg = f"No event consumers are registered for {self.name}"
+ logger.warning(msg)
+ return BroadcastResult(0, 0)
+
+ self._prune_unused_consumers()
+ self._log_broadcast_start()
+
+ num_listeners = len(self._descriptors)
+ num_sent = 0
+ num_failures = 0
+
+ # send each event to every consumer
+ while event := self._get_next_event():
+ logger.debug(f"Broadcasting {event=} to {num_listeners} listeners")
+ event_bytes = bytes(event)
+
+ for i, descriptor in enumerate(self._descriptors):
+ comm_channel = self._get_comm_channel(descriptor)
+
+ try:
+ comm_channel.send(event_bytes, timeout)
+ num_sent += 1
+ except Exception:
+ msg = (
+ f"Broadcast {i+1}/{num_listeners} for event {event.uid} to "
+ f"channel {descriptor} from {self.name} failed."
+ )
+ logger.exception(msg)
+ num_failures += 1
+
+ return BroadcastResult(num_sent, num_failures)
+
+ def send(self, event: EventBase, timeout: float = 0.001) -> int:
+ """Implementation of `send` method of the `EventPublisher` protocol. Publishes
+ the supplied event to all registered broadcast consumers.
+
+ :param event: An event to publish
+ :param timeout: Maximum time to wait (in seconds) for messages to send
+ :returns: The total number of events successfully published to consumers
+ :raises ValueError: If event serialization fails
+ :raises AttributeError: If event cannot be serialized
+ :raises KeyError: If channel fails to attach using registered descriptors
+ :raises SmartSimError: If any unexpected error occurs during send
+ """
+ try:
+ self._save_to_buffer(event)
+ result = self._broadcast(timeout)
+ return result.num_sent
+ except (KeyError, ValueError, AttributeError, SmartSimError):
+ raise
+ except Exception as ex:
+ raise SmartSimError("An unexpected failure occurred while sending") from ex
diff --git a/smartsim/_core/mli/infrastructure/comm/consumer.py b/smartsim/_core/mli/infrastructure/comm/consumer.py
new file mode 100644
index 0000000000..08b5c47852
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/comm/consumer.py
@@ -0,0 +1,281 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pickle
+import time
+import typing as t
+import uuid
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.infrastructure.comm.event import (
+ EventBase,
+ OnCreateConsumer,
+ OnRemoveConsumer,
+ OnShutdownRequested,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class EventConsumer:
+ """Reads system events published to a communications channel."""
+
+ _BACKBONE_WAIT_TIMEOUT = 10.0
+ """Maximum time (in seconds) to wait for the backbone to register the consumer"""
+
+ def __init__(
+ self,
+ comm_channel: CommChannelBase,
+ backbone: BackboneFeatureStore,
+ filters: t.Optional[t.List[str]] = None,
+ name: t.Optional[str] = None,
+ event_handler: t.Optional[t.Callable[[EventBase], None]] = None,
+ ) -> None:
+ """Initialize the EventConsumer instance.
+
+ :param comm_channel: Communications channel to listen to for events
+ :param backbone: The MLI backbone feature store
+ :param filters: A list of event types to deliver. When empty, all
+ events will be delivered
+ :param name: A user-friendly name for logging. If not provided, an
+ auto-generated GUID will be used
+ :param event_handler: An optional callable invoked for each event that
+ passes the configured filters
+ """
+ self._comm_channel = comm_channel
+ """The comm channel used by the consumer to receive messages. The channel
+ descriptor will be published for senders to discover."""
+ self._backbone = backbone
+ """The backbone instance used to bootstrap the instance. The EventConsumer
+ uses the backbone to discover where it can publish its descriptor."""
+ self._global_filters = filters or []
+ """A set of global filters to apply to incoming events. Global filters are
+ combined with per-call filters. Filters act as an allow-list."""
+ self._name = name or str(uuid.uuid4())
+ """User-friendly name assigned to a consumer for logging. Automatically
+ assigned if not provided."""
+ self._event_handler = event_handler
+ """The function that should be executed when an event
+ passed by the filters is received."""
+ self.listening = True
+ """Flag indicating that the consumer is currently listening for new
+ events. Setting this flag to `False` will cause any active calls to
+ `listen` to terminate."""
+
+ @property
+ def descriptor(self) -> str:
+ """The descriptor of the underlying comm channel.
+
+ :returns: The comm channel descriptor"""
+ return self._comm_channel.descriptor
+
+ @property
+ def name(self) -> str:
+ """The friendly name assigned to the consumer.
+
+ :returns: The consumer name if one is assigned, otherwise a unique
+ id assigned by the system.
+ """
+ return self._name
+
+ def recv(
+ self,
+ filters: t.Optional[t.List[str]] = None,
+ timeout: float = 0.001,
+ batch_timeout: float = 1.0,
+ ) -> t.List[EventBase]:
+ """Receives available published event(s).
+
+ :param filters: Additional filters to add to the global filters configured
+ on the EventConsumer instance
+ :param timeout: Maximum time to wait for a single message to arrive
+ :param batch_timeout: Maximum time to wait for messages to arrive; allows
+ multiple batches to be retrieved in one call to `recv`
+ :returns: A list of events that pass any configured filters
+ :raises ValueError: If a positive, non-zero value is not provided for the
+ timeout or batch_timeout.
+ """
+ if filters is None:
+ filters = []
+
+ if timeout is not None and timeout <= 0:
+ raise ValueError("request timeout must be a non-zero, positive value")
+
+ if batch_timeout is not None and batch_timeout <= 0:
+ raise ValueError("batch_timeout must be a non-zero, positive value")
+
+ filter_set = {*self._global_filters, *filters}
+ all_message_bytes: t.List[bytes] = []
+
+ # firehose as many messages as possible within the batch_timeout
+ start_at = time.time()
+ remaining = batch_timeout
+
+ batch_message_bytes = self._comm_channel.recv(timeout=timeout)
+ while batch_message_bytes:
+ # accumulate all received bytes; empty messages are filtered out below
+ all_message_bytes.extend(batch_message_bytes)
+ batch_message_bytes = []
+
+ # avoid getting stuck indefinitely waiting for the channel
+ elapsed = time.time() - start_at
+ remaining = batch_timeout - elapsed
+
+ if remaining > 0:
+ batch_message_bytes = self._comm_channel.recv(timeout=timeout)
+
+ events_received: t.List[EventBase] = []
+
+ # Timeout elapsed or no messages received - return the empty list
+ if not all_message_bytes:
+ return events_received
+
+ for message in all_message_bytes:
+ if not message:
+ continue
+
+ event = pickle.loads(message)
+ if not event:
+ logger.warning(f"Consumer {self.name} is unable to unpickle message")
+ continue
+
+ # skip events that don't pass a filter
+ if filter_set and event.category not in filter_set:
+ continue
+
+ events_received.append(event)
+
+ return events_received
+
+ def _send_to_registrar(self, event: EventBase) -> None:
+ """Send an event direct to the registrar listener."""
+ registrar_key = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER
+ config = self._backbone.wait_for([registrar_key], self._BACKBONE_WAIT_TIMEOUT)
+ registrar_descriptor = str(config.get(registrar_key) or "")
+
+ if not registrar_descriptor:
+ logger.warning(
+ f"Unable to send {event.category} from {self.name}. "
+ "No registrar channel found."
+ )
+ return
+
+ logger.debug(f"Sending {event.category} from {self.name}")
+
+ registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor)
+ registrar_channel.send(bytes(event), timeout=1.0)
+
+ logger.debug(f"{event.category} from {self.name} sent")
+
+ def register(self) -> None:
+ """Send an event to register this consumer as a listener."""
+ descriptor = self._comm_channel.descriptor
+ event = OnCreateConsumer(self.name, descriptor, self._global_filters)
+
+ self._send_to_registrar(event)
+
+ def unregister(self) -> None:
+ """Send an event to un-register this consumer as a listener."""
+ descriptor = self._comm_channel.descriptor
+ event = OnRemoveConsumer(self.name, descriptor)
+
+ self._send_to_registrar(event)
+
+ def _on_handler_missing(self, event: EventBase) -> None:
+ """A "dead letter" event handler that is called to perform
+ processing on events before they're discarded.
+
+ :param event: The event to handle
+ """
+ logger.warning(
+ "No event handler is registered in consumer "
+ f"{self.name}. Discarding {event=}"
+ )
+
+ def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+ """Receives messages for the consumer a single time. Delivers
+ all messages that pass the consumer filters. Shutdown requests
+ are handled by a default event handler.
+
+ NOTE: Executes a single batch-retrieval to receive the maximum
+ number of messages available under batch timeout. To continually
+ listen, use `listen` in a non-blocking thread/process
+
+ :param timeout: Maximum time to wait (in seconds) for a message to arrive
+ :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+ """
+ logger.info(
+ f"Consumer {self.name} listening with {timeout} second timeout"
+ f" on channel {self._comm_channel.descriptor}"
+ )
+
+ if not self._event_handler:
+ logger.info("Unable to handle messages. No event handler is registered.")
+
+ incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout)
+
+ if not incoming_messages:
+ logger.info(f"Consumer {self.name} received empty message list")
+
+ for message in incoming_messages:
+ logger.info(f"Consumer {self.name} is handling event {message=}")
+ self._handle_shutdown(message)
+
+ if self._event_handler:
+ self._event_handler(message)
+ else:
+ self._on_handler_missing(message)
+
+ def _handle_shutdown(self, event: EventBase) -> bool:
+ """Handles shutdown requests sent to the consumer by setting the
+ `self.listening` attribute to `False`.
+
+ :param event: The event to handle
+ :returns: A bool indicating if the event was a shutdown request
+ """
+ if isinstance(event, OnShutdownRequested):
+ logger.debug(f"Shutdown requested from: {event.source}")
+ self.listening = False
+ return True
+ return False
+
+ def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+ """Receives messages for the consumer until a shutdown request is received.
+
+ :param timeout: Maximum time to wait (in seconds) for a message to arrive
+ :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+ """
+
+ logger.debug(f"Consumer {self.name} is now listening for events.")
+
+ while self.listening:
+ self.listen_once(timeout, batch_timeout)
+
+ logger.debug(f"Consumer {self.name} is no longer listening.")
diff --git a/smartsim/_core/mli/infrastructure/comm/event.py b/smartsim/_core/mli/infrastructure/comm/event.py
new file mode 100644
index 0000000000..ccef9f9b86
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/comm/event.py
@@ -0,0 +1,162 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pickle
+import typing as t
+import uuid
+from dataclasses import dataclass, field
+
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class EventBase:
+ """Core API for an event."""
+
+ category: str
+ """Unique category name for an event class"""
+ source: str
+ """A unique identifier for the publisher of the event"""
+ uid: str = field(default_factory=lambda: str(uuid.uuid4()))
+ """A unique identifier for this event"""
+
+ def __bytes__(self) -> bytes:
+ """Default conversion to bytes for an event required to publish
+ messages using byte-oriented communication channels.
+
+ :returns: This entity encoded as bytes"""
+ return pickle.dumps(self)
+
+ def __str__(self) -> str:
+ """Convert the event to a string.
+
+ :returns: A string representation of this instance"""
+ return f"{self.uid}|{self.category}"
+
+
+class OnShutdownRequested(EventBase):
+ """Publish this event to trigger the listener to shutdown."""
+
+ SHUTDOWN: t.ClassVar[str] = "consumer-unregister"
+ """Unique category name for an event raised when a new consumer is unregistered"""
+
+ def __init__(self, source: str) -> None:
+ """Initialize the event instance.
+
+ :param source: A unique identifier for the publisher of the event
+ creating the event
+ """
+ super().__init__(self.SHUTDOWN, source)
+
+
+class OnCreateConsumer(EventBase):
+ """Publish this event when a new event consumer registration is required."""
+
+ descriptor: str
+ """Descriptor of the comm channel exposed by the consumer"""
+ filters: t.List[str]
+ """The collection of filters indicating messages of interest to this consumer"""
+
+ CONSUMER_CREATED: t.ClassVar[str] = "consumer-created"
+ """Unique category name for an event raised when a new consumer is registered"""
+
+ def __init__(self, source: str, descriptor: str, filters: t.Sequence[str]) -> None:
+ """Initialize the event instance.
+
+ :param source: A unique identifier for the publisher of the event
+ :param descriptor: Descriptor of the comm channel exposed by the consumer
+ :param filters: Collection of filters indicating messages of interest
+ """
+ super().__init__(self.CONSUMER_CREATED, source)
+ self.descriptor = descriptor
+ self.filters = list(filters)
+
+ def __str__(self) -> str:
+ """Convert the event to a string.
+
+ :returns: A string representation of this instance
+ """
+ _filters = ",".join(self.filters)
+ return f"{str(super())}|{self.descriptor}|{_filters}"
+
+
+class OnRemoveConsumer(EventBase):
+ """Publish this event when a consumer is shutting down and
+ should be removed from notification lists."""
+
+ descriptor: str
+ """Descriptor of the comm channel exposed by the consumer"""
+
+ CONSUMER_REMOVED: t.ClassVar[str] = "consumer-removed"
+ """Unique category name for an event raised when a new consumer is unregistered"""
+
+ def __init__(self, source: str, descriptor: str) -> None:
+ """Initialize the OnRemoveConsumer event.
+
+ :param source: A unique identifier for the publisher of the event
+ :param descriptor: Descriptor of the comm channel exposed by the consumer
+ """
+ super().__init__(self.CONSUMER_REMOVED, source)
+ self.descriptor = descriptor
+
+ def __str__(self) -> str:
+ """Convert the event to a string.
+
+ :returns: A string representation of this instance
+ """
+ return f"{str(super())}|{self.descriptor}"
+
+
+class OnWriteFeatureStore(EventBase):
+ """Publish this event when a feature store key is written."""
+
+ descriptor: str
+ """The descriptor of the feature store where the write occurred"""
+ key: str
+ """The key identifying where the write occurred"""
+
+ FEATURE_STORE_WRITTEN: str = "feature-store-written"
+ """Event category for an event raised when a feature store key is written"""
+
+ def __init__(self, source: str, descriptor: str, key: str) -> None:
+ """Initialize the OnWriteFeatureStore event.
+
+ :param source: A unique identifier for the publisher of the event
+ :param descriptor: The descriptor of the feature store where the write occurred
+ :param key: The key identifying where the write occurred
+ """
+ super().__init__(self.FEATURE_STORE_WRITTEN, source)
+ self.descriptor = descriptor
+ self.key = key
+
+ def __str__(self) -> str:
+ """Convert the event to a string.
+
+ :returns: A string representation of this instance
+ """
+ return f"{str(super())}|{self.descriptor}|{self.key}"
diff --git a/smartsim/_core/mli/infrastructure/comm/producer.py b/smartsim/_core/mli/infrastructure/comm/producer.py
new file mode 100644
index 0000000000..2d8a7c14ad
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/comm/producer.py
@@ -0,0 +1,44 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+
+from smartsim._core.mli.infrastructure.comm.event import EventBase
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class EventProducer(t.Protocol):
+ """Core API of a class that publishes events."""
+
+ def send(self, event: EventBase, timeout: float = 0.001) -> int:
+ """Send an event using the configured comm channel.
+
+ :param event: The event to send
+ :param timeout: Maximum time to wait (in seconds) for messages to send
+ :returns: The number of messages that were sent
+ """
diff --git a/smartsim/_core/mli/infrastructure/control/__init__.py b/smartsim/_core/mli/infrastructure/control/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/smartsim/_core/mli/infrastructure/control/device_manager.py b/smartsim/_core/mli/infrastructure/control/device_manager.py
new file mode 100644
index 0000000000..9334971f8c
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/control/device_manager.py
@@ -0,0 +1,166 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+from contextlib import _GeneratorContextManager, contextmanager
+
+from .....log import get_logger
+from ..storage.feature_store import FeatureStore
+from ..worker.worker import MachineLearningWorkerBase, RequestBatch
+
+logger = get_logger(__name__)
+
+
+class WorkerDevice:
+ def __init__(self, name: str) -> None:
+ """Wrapper around a device to keep track of loaded Models and availability.
+
+ :param name: Name used by the toolkit to identify this device, e.g. ``cuda:0``
+ """
+ self._name = name
+ """The name used by the toolkit to identify this device"""
+ self._models: dict[str, t.Any] = {}
+ """Dict of keys to models which are loaded on this device"""
+
+ @property
+ def name(self) -> str:
+ """The identifier of the device represented by this object
+
+ :returns: Name used by the toolkit to identify this device
+ """
+ return self._name
+
+ def add_model(self, key: str, model: t.Any) -> None:
+ """Add a reference to a model loaded on this device and assign it a key.
+
+ :param key: The key under which the model is saved
+ :param model: The model which is added
+ """
+ self._models[key] = model
+
+ def remove_model(self, key: str) -> None:
+ """Remove the reference to a model loaded on this device.
+
+ :param key: The key of the model to remove
+ :raises KeyError: If key does not exist for removal
+ """
+ try:
+ self._models.pop(key)
+ except KeyError:
+ logger.warning(f"An unknown key was requested for removal: {key}")
+ raise
+
+ def get_model(self, key: str) -> t.Any:
+ """Get the model corresponding to a given key.
+
+ :param key: The model key
+ :returns: The model for the given key
+ :raises KeyError: If key does not exist
+ """
+ try:
+ return self._models[key]
+ except KeyError:
+ logger.warning(f"An unknown key was requested: {key}")
+ raise
+
+ def __contains__(self, key: str) -> bool:
+ """Check if model with a given key is available on the device.
+
+ :param key: The key of the model to check for existence
+ :returns: Whether the model is available on the device
+ """
+ return key in self._models
+
+ @contextmanager
+ def get(self, key_to_remove: t.Optional[str]) -> t.Iterator["WorkerDevice"]:
+ """Get the WorkerDevice generator and optionally remove a model.
+
+ :param key_to_remove: The key of the model to optionally remove
+ :returns: WorkerDevice generator
+ """
+ yield self
+ if key_to_remove is not None:
+ self.remove_model(key_to_remove)
+
+
+class DeviceManager:
+ def __init__(self, device: WorkerDevice):
+ """An object to manage devices such as GPUs and CPUs.
+
+ The main goal of the ``DeviceManager`` is to ensure that
+ the managed device is ready to be used by a worker to
+ run a given model.
+
+ :param device: The managed device
+ """
+ self._device = device
+ """Device managed by this object"""
+
+ def _load_model_on_device(
+ self,
+ worker: MachineLearningWorkerBase,
+ batch: RequestBatch,
+ feature_stores: dict[str, FeatureStore],
+ ) -> None:
+ """Load the model needed to execute a batch on the managed device.
+
+ The model is loaded by the worker.
+
+ :param worker: The worker that loads the model
+ :param batch: The batch for which the model is needed
+ :param feature_stores: Feature stores where the model could be stored
+ """
+
+ model_bytes = worker.fetch_model(batch, feature_stores)
+ loaded_model = worker.load_model(batch, model_bytes, self._device.name)
+ self._device.add_model(batch.model_id.key, loaded_model.model)
+
+ def get_device(
+ self,
+ worker: MachineLearningWorkerBase,
+ batch: RequestBatch,
+ feature_stores: dict[str, FeatureStore],
+ ) -> _GeneratorContextManager[WorkerDevice]:
+ """Get the device managed by this object.
+
+ The model needed to run the batch of requests is
+ guaranteed to be available on the device.
+
+ :param worker: The worker that wants to access the device
+ :param batch: The batch of requests
+ :param feature_stores: The feature stores where part of the
+ data needed by the requests may be stored
+ :returns: A generator yielding the device
+ """
+ model_in_request = batch.has_raw_model
+
+ # Load the model if it is not already loaded on the device,
+ # or if it was sent directly with the request
+ if model_in_request or batch.model_id.key not in self._device:
+ self._load_model_on_device(worker, batch, feature_stores)
+
+ key_to_remove = batch.model_id.key if model_in_request else None
+ return self._device.get(key_to_remove)
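+
+
+# Usage sketch: guarding model execution with the managed device. The worker,
+# batch, and feature_stores objects are assumed to be created elsewhere.
+#
+#   device_manager = DeviceManager(WorkerDevice("cuda:0"))
+#   with device_manager.get_device(worker, batch, feature_stores) as device:
+#       model = device.get_model(batch.model_id.key)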
diff --git a/smartsim/_core/mli/infrastructure/control/dragon_util.py b/smartsim/_core/mli/infrastructure/control/dragon_util.py
new file mode 100644
index 0000000000..95c3e60524
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/control/dragon_util.py
@@ -0,0 +1,79 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import annotations
+
+import os
+import socket
+import typing as t
+
+from smartsim.log import get_logger
+
+# isort: off
+# pylint: disable=import-error
+# pylint: disable-next=unused-import
+import dragon
+
+import dragon.infrastructure.policy as dragon_policy
+import dragon.infrastructure.process_desc as dragon_process_desc
+import dragon.native.process as dragon_process
+
+# pylint: enable=import-error
+# isort: on
+
+
+logger = get_logger(__name__)
+
+
+def function_as_dragon_proc(
+ entrypoint_fn: t.Callable[..., None],
+ args: t.List[t.Any],
+ cpu_affinity: t.List[int],
+ gpu_affinity: t.List[int],
+) -> dragon_process.Process:
+ """Execute a function as an independent dragon process.
+
+ :param entrypoint_fn: The function to execute
+ :param args: The arguments for the entrypoint function
+ :param cpu_affinity: The cpu affinity for the process
+ :param gpu_affinity: The gpu affinity for the process
+ :returns: The dragon process handle
+ """
+ options = dragon_process_desc.ProcessOptions(make_inf_channels=True)
+ local_policy = dragon_policy.Policy(
+ placement=dragon_policy.Policy.Placement.HOST_NAME,
+ host_name=socket.gethostname(),
+ cpu_affinity=cpu_affinity,
+ gpu_affinity=gpu_affinity,
+ )
+ return dragon_process.Process(
+ target=entrypoint_fn,
+ args=args,
+ cwd=os.getcwd(),
+ policy=local_policy,
+ options=options,
+ stderr=dragon_process.Popen.STDOUT,
+ stdout=dragon_process.Popen.STDOUT,
+ )
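+
+
+# Usage sketch: launching an entrypoint as a standalone dragon process. The
+# entrypoint function, arguments, and affinity lists are illustrative
+# placeholders.
+#
+#   proc = function_as_dragon_proc(
+#       my_entrypoint, args=[config], cpu_affinity=[0, 1], gpu_affinity=[0]
+#   )
+#   proc.start()
+#   proc.join()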
diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py
new file mode 100644
index 0000000000..a75f533a37
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/control/error_handling.py
@@ -0,0 +1,78 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+
+from .....log import get_logger
+from ...comm.channel.channel import CommChannelBase
+from ...message_handler import MessageHandler
+from ...mli_schemas.response.response_capnp import ResponseBuilder
+
+if t.TYPE_CHECKING:
+ from smartsim._core.mli.mli_schemas.response.response_capnp import Status
+
+logger = get_logger(__file__)
+
+
+def build_failure_reply(status: "Status", message: str) -> ResponseBuilder:
+ """
+ Builds a failure response message.
+
+ :param status: Status enum
+ :param message: Status message
+ :returns: Failure response
+ """
+ return MessageHandler.build_response(
+ status=status,
+ message=message,
+ result=None,
+ custom_attributes=None,
+ )
+
+
+def exception_handler(
+ exc: Exception,
+ reply_channel: t.Optional[CommChannelBase],
+ failure_message: t.Optional[str],
+) -> None:
+ """
+ Logs exceptions and sends a failure response.
+
+ :param exc: The exception to be logged
+ :param reply_channel: The channel used to send replies
+ :param failure_message: Failure message to log and send back
+ """
+ logger.exception(exc)
+ if reply_channel:
+ if failure_message is None:
+ failure_message = str(exc)
+
+ serialized_resp = MessageHandler.serialize_response(
+ build_failure_reply("fail", failure_message)
+ )
+ reply_channel.send(serialized_resp)
+ else:
+ logger.warning("Unable to notify client of error without a reply channel")
diff --git a/smartsim/_core/mli/infrastructure/control/listener.py b/smartsim/_core/mli/infrastructure/control/listener.py
new file mode 100644
index 0000000000..56a7b12d34
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/control/listener.py
@@ -0,0 +1,352 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# isort: off
+# pylint: disable=import-error
+# pylint: disable=unused-import
+import socket
+import dragon
+
+# pylint: enable=unused-import
+# pylint: enable=import-error
+# isort: on
+
+import argparse
+import multiprocessing as mp
+import os
+import sys
+import typing as t
+
+from smartsim._core.entrypoints.service import Service
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer
+from smartsim._core.mli.infrastructure.comm.event import (
+ EventBase,
+ OnCreateConsumer,
+ OnRemoveConsumer,
+ OnShutdownRequested,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class ConsumerRegistrationListener(Service):
+ """A long-running service that manages the list of consumers receiving
+ events that are broadcast. It hosts handlers for adding and removing consumers
+ """
+
+ def __init__(
+ self,
+ backbone: BackboneFeatureStore,
+ timeout: float,
+ batch_timeout: float,
+ as_service: bool = False,
+ cooldown: int = 0,
+ health_check_frequency: float = 60.0,
+ ) -> None:
+ """Initialize the EventListener.
+
+ :param backbone: The backbone feature store
+ :param timeout: Maximum time (in seconds) to allow a single recv request to wait
+ :param batch_timeout: Maximum time (in seconds) to allow a batch of receives to
+ continue to build
+ :param as_service: Specifies run-once or run-until-complete behavior of service
+ :param cooldown: Number of seconds to wait before shutting down after
+ shutdown criteria are met
+ :param health_check_frequency: Time (in seconds) between calls to the
+ health check handler
+ """
+ super().__init__(
+ as_service, cooldown, health_check_frequency=health_check_frequency
+ )
+ self._timeout = timeout
+ """ Maximum time (in seconds) to allow a single recv request to wait"""
+ self._batch_timeout = batch_timeout
+ """Maximum time (in seconds) to allow a batch of receives to
+ continue to build"""
+ self._consumer: t.Optional[EventConsumer] = None
+ """The event consumer that handles receiving events"""
+ self._backbone = backbone
+ """A standalone, system-created feature store used to share internal
+ information among MLI components"""
+
+ def _on_start(self) -> None:
+ """Called on initial entry into Service `execute` event loop before
+ `_on_iteration` is invoked."""
+ super()._on_start()
+ self._create_eventing()
+
+ def _on_shutdown(self) -> None:
+ """Release dragon resources. Called immediately after exiting
+ the main event loop during automatic shutdown."""
+ super()._on_shutdown()
+
+ if not self._consumer:
+ return
+
+ # remove descriptor for this listener from the backbone if it's there
+ if registered_consumer := self._backbone.backend_channel:
+ # if there is a descriptor in the backbone and it's still this listener
+ if registered_consumer == self._consumer.descriptor:
+ logger.info(
+ f"Listener clearing backend consumer {self._consumer.name} "
+ "from backbone"
+ )
+
+ # unregister this listener in the backbone
+ self._backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER)
+
+ # TODO: need the channel to be cleaned up
+ # self._consumer._comm_channel._channel.destroy()
+
+ def _on_iteration(self) -> None:
+ """Executes calls to the machine learning worker implementation to complete
+ the inference pipeline."""
+
+ if self._consumer is None:
+ logger.info("Unable to listen. No consumer available.")
+ return
+
+ self._consumer.listen_once(self._timeout, self._batch_timeout)
+
+ def _can_shutdown(self) -> bool:
+ """Determines if the event consumer is ready to stop listening.
+
+ :returns: True when criteria to shutdown the service are met, False otherwise
+ """
+
+ if self._backbone is None:
+ logger.info("Listener must shutdown. No backbone attached")
+ return True
+
+ if self._consumer is None:
+ logger.info("Listener must shutdown. No consumer channel created")
+ return True
+
+ if not self._consumer.listening:
+ logger.info(
+ f"Listener can shutdown. Consumer `{self._consumer.name}` "
+ "is not listening"
+ )
+ return True
+
+ return False
+
+ def _on_unregister(self, event: OnRemoveConsumer) -> None:
+ """Event handler for updating the backbone when event consumers
+ are un-registered.
+
+ :param event: The event that was received
+ """
+ notify_list = set(self._backbone.notification_channels)
+
+ # remove the descriptor specified in the event
+ if event.descriptor in notify_list:
+ logger.debug(f"Removing notify consumer: {event.descriptor}")
+ notify_list.remove(event.descriptor)
+
+ # push the updated list back into the backbone
+ self._backbone.notification_channels = list(notify_list)
+
+ def _on_register(self, event: OnCreateConsumer) -> None:
+ """Event handler for updating the backbone when new event consumers
+ are registered.
+
+ :param event: The event that was received
+ """
+ notify_list = set(self._backbone.notification_channels)
+ logger.debug(f"Adding notify consumer: {event.descriptor}")
+ notify_list.add(event.descriptor)
+ self._backbone.notification_channels = list(notify_list)
+
+ def _on_event_received(self, event: EventBase) -> None:
+ """Primary event handler for the listener. Distributes events to
+ type-specific handlers.
+
+ :param event: The event that was received
+ """
+ if self._backbone is None:
+ logger.info("Unable to handle event. Backbone is missing.")
+
+ if isinstance(event, OnCreateConsumer):
+ self._on_register(event)
+ elif isinstance(event, OnRemoveConsumer):
+ self._on_unregister(event)
+ else:
+ logger.info(
+ "Consumer registration listener received an "
+ f"unexpected event: {event=}"
+ )
+
+ def _on_health_check(self) -> None:
+ """Check if this consumer has been replaced by a new listener
+ and automatically trigger a shutdown. Invoked based on the
+ value of `self._health_check_frequency`."""
+ super()._on_health_check()
+
+ try:
+ logger.debug("Retrieving registered listener descriptor")
+ descriptor = self._backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER]
+ except KeyError:
+ descriptor = None
+ if self._consumer:
+ self._consumer.listening = False
+
+ if self._consumer and descriptor != self._consumer.descriptor:
+ logger.warning(
+ f"Consumer `{self._consumer.name}` for `ConsumerRegistrationListener` "
+ "is no longer registered. It will automatically shut down."
+ )
+ self._consumer.listening = False
+
+ def _publish_consumer(self) -> None:
+ """Publish the registrar consumer descriptor to the backbone."""
+ if self._consumer is None:
+ logger.warning("No registrar consumer descriptor available to publisher")
+ return
+
+ logger.debug(f"Publishing {self._consumer.descriptor} to backbone")
+ self._backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = (
+ self._consumer.descriptor
+ )
+
+ def _create_eventing(self) -> EventConsumer:
+ """
+ Create an event consumer for receiving registration events from
+ other MLI resources.
+
+ NOTE: the backbone must be initialized before connecting eventing clients.
+
+ :returns: The newly created EventConsumer instance
+ :raises SmartSimError: If a listener channel cannot be created
+ """
+
+ if self._consumer:
+ return self._consumer
+
+ logger.info("Creating event consumer")
+
+ dragon_channel = create_local(500)
+ event_channel = DragonCommChannel(dragon_channel)
+
+ if not event_channel.descriptor:
+ raise SmartSimError(
+ "Unable to generate the descriptor for the event channel"
+ )
+
+ self._consumer = EventConsumer(
+ event_channel,
+ self._backbone,
+ [
+ OnCreateConsumer.CONSUMER_CREATED,
+ OnRemoveConsumer.CONSUMER_REMOVED,
+ OnShutdownRequested.SHUTDOWN,
+ ],
+ name=f"ConsumerRegistrar.{socket.gethostname()}",
+ event_handler=self._on_event_received,
+ )
+ self._publish_consumer()
+
+ logger.info(
+ f"Backend consumer `{self._consumer.name}` created: "
+ f"{self._consumer.descriptor}"
+ )
+
+ return self._consumer
+
+
+def _create_parser() -> argparse.ArgumentParser:
+ """
+ Create an argument parser that contains the arguments
+ required to start the listener as a new process:
+
+ --timeout
+ --batch_timeout
+
+ :returns: A configured parser
+ """
+ arg_parser = argparse.ArgumentParser(prog="ConsumerRegistrarEventListener")
+
+ arg_parser.add_argument("--timeout", type=float, default=1.0)
+ arg_parser.add_argument("--batch_timeout", type=float, default=1.0)
+
+ return arg_parser
+
+
+def _connect_backbone() -> t.Optional[BackboneFeatureStore]:
+ """
+ Load the backbone by retrieving the descriptor from environment variables.
+
+ :returns: The backbone feature store, or None if no descriptor is set
+ in the environment
+ """
+ descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, "")
+
+ if not descriptor:
+ return None
+
+ logger.info(f"Listener backbone descriptor: {descriptor}\n")
+
+ # `from_writable_descriptor` ensures we can update the backbone
+ return BackboneFeatureStore.from_writable_descriptor(descriptor)
+
+
+if __name__ == "__main__":
+ mp.set_start_method("dragon")
+
+ parser = _create_parser()
+ args = parser.parse_args()
+
+ backbone_fs = _connect_backbone()
+
+ if backbone_fs is None:
+ logger.error(
+ "Unable to attach to the backbone without the "
+ f"`{BackboneFeatureStore.MLI_BACKBONE}` environment variable."
+ )
+ sys.exit(1)
+
+ logger.debug(f"Listener attached to backbone: {backbone_fs.descriptor}")
+
+ listener = ConsumerRegistrationListener(
+ backbone_fs,
+ float(args.timeout),
+ float(args.batch_timeout),
+ as_service=True,
+ )
+
+ logger.info(f"listener created? {listener}")
+
+ try:
+ listener.execute()
+ sys.exit(0)
+ except Exception:
+ logger.exception("An error occurred in the event listener")
+ sys.exit(1)
diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py
new file mode 100644
index 0000000000..e22a2c8f62
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py
@@ -0,0 +1,559 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# pylint: disable=import-error
+# pylint: disable-next=unused-import
+import dragon
+import dragon.globalservices.pool as dragon_gs_pool
+from dragon.managed_memory import MemoryPool
+from dragon.mpbridge.queues import DragonQueue
+
+# pylint: enable=import-error
+
+# isort: off
+# isort: on
+
+import multiprocessing as mp
+import time
+import typing as t
+import uuid
+from queue import Empty, Full, Queue
+
+from smartsim._core.entrypoints.service import Service
+
+from .....error import SmartSimError
+from .....log import get_logger
+from ....utils.timings import PerfTimer
+from ..environment_loader import EnvironmentConfigLoader
+from ..storage.feature_store import FeatureStore
+from ..worker.worker import (
+ InferenceRequest,
+ MachineLearningWorkerBase,
+ ModelIdentifier,
+ RequestBatch,
+)
+from .error_handling import exception_handler
+
+if t.TYPE_CHECKING:
+ from smartsim._core.mli.mli_schemas.response.response_capnp import Status
+
+logger = get_logger("Request Dispatcher")
+
+
+class BatchQueue(Queue[InferenceRequest]):
+ def __init__(
+ self, batch_timeout: float, batch_size: int, model_id: ModelIdentifier
+ ) -> None:
+ """Queue used to store inference requests waiting to be batched and
+ sent to Worker Managers.
+
+ :param batch_timeout: Maximum time (in seconds) to wait before flushing a
+ non-full queue, measured from the moment the first item is put on the queue
+ :param batch_size: Total capacity of the queue
+ :param model_id: Key of the model which needs to be executed on the queued
+ requests
+ """
+ super().__init__(maxsize=batch_size)
+ self._batch_timeout = batch_timeout
+ """Time in seconds that has to be waited before flushing a non-full queue.
+ The time of the first item put is 0 seconds."""
+ self._batch_size = batch_size
+ """Total capacity of the queue"""
+ self._first_put: t.Optional[float] = None
+ """Time at which the first item was put on the queue"""
+ self._disposable = False
+ """Whether the queue will not be used again and can be deleted.
+ A disposable queue is always full."""
+ self._model_id: ModelIdentifier = model_id
+ """Key of the model which needs to be executed on the queued requests"""
+ self._uid = str(uuid.uuid4())
+ """Unique ID of queue"""
+
+ @property
+ def uid(self) -> str:
+ """ID of this queue.
+
+ :returns: Queue ID
+ """
+ return self._uid
+
+ @property
+ def model_id(self) -> ModelIdentifier:
+ """Key of the model which needs to be run on the queued requests.
+
+ :returns: Model key
+ """
+ return self._model_id
+
+ def put(
+ self,
+ item: InferenceRequest,
+ block: bool = False,
+ timeout: t.Optional[float] = 0.0,
+ ) -> None:
+ """Put an inference request in the queue.
+
+ :param item: The request
+ :param block: Whether to block when trying to put the item
+ :param timeout: Time (in seconds) to wait if block==True
+ :raises Full: If an item cannot be put on the queue
+ """
+ super().put(item, block=block, timeout=timeout)
+ if self._first_put is None:
+ self._first_put = time.time()
+
+ @property
+ def _elapsed_time(self) -> float:
+ """Time elapsed since the first item was put on this queue.
+
+ :returns: Time elapsed
+ """
+ if self.empty() or self._first_put is None:
+ return 0
+ return time.time() - self._first_put
+
+ @property
+ def ready(self) -> bool:
+ """Check if the queue can be flushed.
+
+ :returns: True if the queue can be flushed, False otherwise
+ """
+ if self.empty():
+ logger.debug("Request dispatcher queue is empty")
+ return False
+
+ timed_out = False
+ if self._batch_timeout >= 0:
+ timed_out = self._elapsed_time >= self._batch_timeout
+
+ if self.full():
+ logger.debug("Request dispatcher ready to deliver full batch")
+ return True
+
+ if timed_out:
+ logger.debug("Request dispatcher delivering partial batch")
+ return True
+
+ return False
+
+ def make_disposable(self) -> None:
+ """Set this queue as disposable, and never use it again after it gets
+ flushed."""
+ self._disposable = True
+
+ @property
+ def can_be_removed(self) -> bool:
+ """Determine whether this queue can be deleted and garbage collected.
+
+ :returns: True if queue can be removed, False otherwise
+ """
+ return self.empty() and self._disposable
+
+ def flush(self) -> list[t.Any]:
+ """Get all requests from queue.
+
+ :returns: Requests waiting to be executed
+ """
+ num_items = self.qsize()
+ self._first_put = None
+ items = []
+ for _ in range(num_items):
+ try:
+ items.append(self.get())
+ except Empty:
+ break
+
+ return items
+
+ def full(self) -> bool:
+ """Check if the queue has reached its maximum capacity.
+
+ :returns: True if the queue has reached its maximum capacity,
+ False otherwise
+ """
+ if self._disposable:
+ return True
+ return self.qsize() >= self._batch_size
+
+ def empty(self) -> bool:
+ """Check if the queue is empty.
+
+ :returns: True if the queue has 0 elements, False otherwise
+ """
+ return self.qsize() == 0
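+
+# Usage sketch for BatchQueue: requests accumulate until the queue is full or
+# the batch timeout elapses, then `flush()` drains them as one batch. The
+# model_id and request objects are assumed to come from the surrounding
+# dispatcher code.
+#
+#   queue = BatchQueue(batch_timeout=0.1, batch_size=8, model_id=model_id)
+#   queue.put(request)
+#   if queue.ready:
+#       pending_requests = queue.flush()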
+
+
+class RequestDispatcher(Service):
+ def __init__(
+ self,
+ batch_timeout: float,
+ batch_size: int,
+ config_loader: EnvironmentConfigLoader,
+ worker_type: t.Type[MachineLearningWorkerBase],
+ mem_pool_size: int = 2 * 1024**3,
+ ) -> None:
+ """The RequestDispatcher intercepts inference requests, stages them in
+ queues and batches them together before making them available to Worker
+ Managers.
+
+ :param batch_timeout: Maximum elapsed time before flushing a complete or
+ incomplete batch
+ :param batch_size: Total capacity of each batch queue
+ :param config_loader: Object to load configuration from environment
+ :param worker_type: Type of worker to instantiate to batch inputs
+ :param mem_pool_size: Size (in bytes) of the memory pool used to share
+ batched input tensors with the worker managers
+ """
+ super().__init__(as_service=True, cooldown=1)
+ self._queues: dict[str, list[BatchQueue]] = {}
+ """Dict of all batch queues available for a given model id"""
+ self._active_queues: dict[str, BatchQueue] = {}
+ """Mapping telling which queue is the recipient of requests for a given model
+ key"""
+ self._batch_timeout = batch_timeout
+ """Time in seconds that has to be waited before flushing a non-full queue"""
+ self._batch_size = batch_size
+ """Total capacity of each batch queue"""
+ incoming_channel = config_loader.get_queue()
+ if incoming_channel is None:
+ raise SmartSimError("No incoming channel for dispatcher")
+ self._incoming_channel = incoming_channel
+ """The channel the dispatcher monitors for new tasks"""
+ self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0)
+ """The queue on which batched inference requests are placed"""
+ self._feature_stores: t.Dict[str, FeatureStore] = {}
+ """A collection of attached feature stores"""
+ self._featurestore_factory = config_loader._featurestore_factory
+ """A factory method to create a desired feature store client type"""
+ self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone()
+ """A standalone, system-created feature store used to share internal
+ information among MLI components"""
+ self._callback_factory = config_loader._callback_factory
+ """The type of communication channel to construct for callbacks"""
+ self._worker = worker_type()
+ """The worker used to batch inputs"""
+ self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(mem_pool_size).sdesc)
+ """Memory pool used to share batched input tensors with the Worker Managers"""
+ self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True)
+ """Performance timer"""
+
+ @property
+ def has_featurestore_factory(self) -> bool:
+ """Check if the RequestDispatcher has a FeatureStore factory.
+
+ :returns: True if there is a FeatureStore factory, False otherwise
+ """
+ return self._featurestore_factory is not None
+
+ def _check_feature_stores(self, request: InferenceRequest) -> bool:
+ """Ensures that all feature stores required by the request are available.
+
+ :param request: The request to validate
+ :returns: False if feature store validation fails for the request, True
+ otherwise
+ """
+ # collect all feature stores required by the request
+ fs_model: t.Set[str] = set()
+ if request.model_key:
+ fs_model = {request.model_key.descriptor}
+ fs_inputs = {key.descriptor for key in request.input_keys}
+ fs_outputs = {key.descriptor for key in request.output_keys}
+
+ # identify which feature stores are requested and unknown
+ fs_desired = fs_model.union(fs_inputs).union(fs_outputs)
+ fs_actual = {item.descriptor for item in self._feature_stores.values()}
+ fs_missing = fs_desired - fs_actual
+
+ if not self.has_featurestore_factory:
+ logger.error("No feature store factory is configured. Unable to dispatch.")
+ return False
+
+ # create the feature stores we need to service request
+ if fs_missing:
+ logger.debug(f"Adding feature store(s): {fs_missing}")
+ for descriptor in fs_missing:
+ feature_store = self._featurestore_factory(descriptor)
+ self._feature_stores[descriptor] = feature_store
+
+ return True
+
+ # pylint: disable-next=no-self-use
+ def _check_model(self, request: InferenceRequest) -> bool:
+ """Ensure that a model is available for the request.
+
+ :param request: The request to validate
+ :returns: False if model validation fails for the request, True otherwise
+ """
+ if request.has_model_key or request.has_raw_model:
+ return True
+
+ logger.error("Unable to continue without model bytes or feature store key")
+ return False
+
+ # pylint: disable-next=no-self-use
+ def _check_inputs(self, request: InferenceRequest) -> bool:
+ """Ensure that inputs are available for the request.
+
+ :param request: The request to validate
+ :returns: False if input validation fails for the request, True otherwise
+ """
+ if request.has_input_keys or request.has_raw_inputs:
+ return True
+
+ logger.error("Unable to continue without input bytes or feature store keys")
+ return False
+
+ # pylint: disable-next=no-self-use
+ def _check_callback(self, request: InferenceRequest) -> bool:
+ """Ensure that a callback channel is available for the request.
+
+ :param request: The request to validate
+ :returns: False if callback validation fails for the request, True otherwise
+ """
+ if request.callback:
+ return True
+
+ logger.error("No callback channel provided in request")
+ return False
+
+ def _validate_request(self, request: InferenceRequest) -> bool:
+ """Ensure the request can be processed.
+
+ :param request: The request to validate
+ :returns: False if the request fails any validation checks, True otherwise
+ """
+ checks = [
+ self._check_feature_stores(request),
+ self._check_model(request),
+ self._check_inputs(request),
+ self._check_callback(request),
+ ]
+
+ return all(checks)
+
+ def _on_iteration(self) -> None:
+ """This method is executed repeatedly until ``Service`` shutdown
+ conditions are satisfied and cooldown is elapsed."""
+ try:
+ self._perf_timer.is_active = True
+ bytes_list: t.List[bytes] = self._incoming_channel.recv()
+ except Exception:
+ self._perf_timer.is_active = False
+ else:
+ if not bytes_list:
+ exception_handler(
+ ValueError("No request data found"),
+ None,
+ None,
+ )
+ return
+
+ logger.debug(f"Dispatcher is processing {len(bytes_list)} messages")
+ request_bytes = bytes_list[0]
+ tensor_bytes_list = bytes_list[1:]
+ self._perf_timer.start_timings()
+
+ request = self._worker.deserialize_message(
+ request_bytes, self._callback_factory
+ )
+ if request.has_input_meta and tensor_bytes_list:
+ request.raw_inputs = tensor_bytes_list
+
+ self._perf_timer.measure_time("deserialize_message")
+
+ if not self._validate_request(request):
+ exception_handler(
+ ValueError("Error validating the request"),
+ request.callback,
+ None,
+ )
+ self._perf_timer.measure_time("validate_request")
+ else:
+ self._perf_timer.measure_time("validate_request")
+ self.dispatch(request)
+ self._perf_timer.measure_time("dispatch")
+ finally:
+ self.flush_requests()
+ self.remove_queues()
+
+ self._perf_timer.end_timings()
+
+ if self._perf_timer.max_length == 801 and self._perf_timer.is_active:
+ self._perf_timer.print_timings(True)
+
+ def remove_queues(self) -> None:
+ """Remove references to queues that can be removed
+ and allow them to be garbage collected."""
+ queue_lists_to_remove = []
+ for key, queues in self._queues.items():
+ queues_to_remove = []
+ for queue in queues:
+ if queue.can_be_removed:
+ queues_to_remove.append(queue)
+
+ for queue_to_remove in queues_to_remove:
+ queues.remove(queue_to_remove)
+ if (
+ key in self._active_queues
+ and self._active_queues[key] == queue_to_remove
+ ):
+ del self._active_queues[key]
+
+ if len(queues) == 0:
+ queue_lists_to_remove.append(key)
+
+ for key in queue_lists_to_remove:
+ del self._queues[key]
+
+ @property
+ def task_queue(self) -> DragonQueue:
+ """The queue on which batched requests are placed.
+
+ :returns: The queue
+ """
+ return self._outgoing_queue
+
+ def _swap_queue(self, model_id: ModelIdentifier) -> None:
+ """Get an empty queue or create a new one
+ and make it the active one for a given model.
+
+ :param model_id: The id of the model for which the
+ queue has to be swapped
+ """
+ if model_id.key in self._queues:
+ for queue in self._queues[model_id.key]:
+ if not queue.full():
+ self._active_queues[model_id.key] = queue
+ return
+
+ new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_id)
+ if model_id.key in self._queues:
+ self._queues[model_id.key].append(new_queue)
+ else:
+ self._queues[model_id.key] = [new_queue]
+ self._active_queues[model_id.key] = new_queue
+ return
+
+ def dispatch(self, request: InferenceRequest) -> None:
+ """Assign a request to a batch queue.
+
+ :param request: The request to place
+ """
+ if request.has_raw_model:
+ logger.debug("Direct inference requested, creating tmp queue")
+ tmp_id = f"_tmp_{str(uuid.uuid4())}"
+ tmp_queue: BatchQueue = BatchQueue(
+ batch_timeout=0,
+ batch_size=1,
+ model_id=ModelIdentifier(key=tmp_id, descriptor="TMP"),
+ )
+ self._active_queues[tmp_id] = tmp_queue
+ self._queues[tmp_id] = [tmp_queue]
+ tmp_queue.put(request)
+ tmp_queue.make_disposable()
+ return
+
+ if request.model_key:
+ success = False
+ while not success:
+ try:
+ self._active_queues[request.model_key.key].put_nowait(request)
+ success = True
+ except (Full, KeyError):
+ self._swap_queue(request.model_key)
+
+ def flush_requests(self) -> None:
+ """Get all requests from queues which are ready to be flushed. Place all
+ available request batches in the outgoing queue."""
+ for queue_list in self._queues.values():
+ for queue in queue_list:
+ if queue.ready:
+ self._perf_timer.measure_time("find_queue")
+ try:
+ batch = RequestBatch(
+ requests=queue.flush(),
+ inputs=None,
+ model_id=queue.model_id,
+ )
+ finally:
+ self._perf_timer.measure_time("flush_requests")
+ try:
+ fetch_results = self._worker.fetch_inputs(
+ batch=batch, feature_stores=self._feature_stores
+ )
+ except Exception as exc:
+ exception_handler(
+ exc,
+ None,
+ "Error fetching input.",
+ )
+ continue
+ self._perf_timer.measure_time("fetch_input")
+ try:
+ transformed_inputs = self._worker.transform_input(
+ batch=batch,
+ fetch_results=fetch_results,
+ mem_pool=self._mem_pool,
+ )
+ except Exception as exc:
+ exception_handler(
+ exc,
+ None,
+ "Error transforming input.",
+ )
+ continue
+
+ self._perf_timer.measure_time("transform_input")
+ batch.inputs = transformed_inputs
+ for request in batch.requests:
+ request.raw_inputs = []
+ request.input_meta = []
+
+ try:
+ self._outgoing_queue.put(batch)
+ except Exception as exc:
+ exception_handler(
+ exc,
+ None,
+ "Error placing batch on task queue.",
+ )
+ continue
+ self._perf_timer.measure_time("put")
+
+ def _can_shutdown(self) -> bool:
+ """Determine whether the Service can be shut down.
+
+ :returns: False
+ """
+ return False
+
+ def __del__(self) -> None:
+ """Destroy allocated memory resources."""
+ # pool may be null if a failure occurs prior to successful attach
+ pool: t.Optional[MemoryPool] = getattr(self, "_mem_pool", None)
+
+ if pool:
+ pool.destroy()
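+
+
+# Usage sketch: the dispatcher is typically constructed from an environment
+# config loader and run as a service; `config_loader` and `MyWorker` are
+# placeholders for the concrete loader and worker implementation.
+#
+#   dispatcher = RequestDispatcher(
+#       batch_timeout=0.01,
+#       batch_size=4,
+#       config_loader=config_loader,
+#       worker_type=MyWorker,
+#   )
+#   dispatcher.execute()  # batches are placed on dispatcher.task_queue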
diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py
new file mode 100644
index 0000000000..bf6fddb81d
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py
@@ -0,0 +1,330 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# pylint: disable=import-error
+# pylint: disable-next=unused-import
+import dragon
+
+# pylint: enable=import-error
+
+# isort: off
+# isort: on
+
+import multiprocessing as mp
+import time
+import typing as t
+from queue import Empty
+
+from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore
+
+from .....log import get_logger
+from ....entrypoints.service import Service
+from ....utils.timings import PerfTimer
+from ...message_handler import MessageHandler
+from ..environment_loader import EnvironmentConfigLoader
+from ..worker.worker import (
+ InferenceReply,
+ LoadModelResult,
+ MachineLearningWorkerBase,
+ RequestBatch,
+)
+from .device_manager import DeviceManager, WorkerDevice
+from .error_handling import build_failure_reply, exception_handler
+
+if t.TYPE_CHECKING:
+ from smartsim._core.mli.mli_schemas.response.response_capnp import Status
+
+logger = get_logger(__name__)
+
+
+class WorkerManager(Service):
+ """An implementation of a service managing distribution of tasks to
+ machine learning workers."""
+
+ def __init__(
+ self,
+ config_loader: EnvironmentConfigLoader,
+ worker_type: t.Type[MachineLearningWorkerBase],
+ dispatcher_queue: "mp.Queue[RequestBatch]",
+ as_service: bool = False,
+ cooldown: int = 0,
+ device: t.Literal["cpu", "gpu"] = "cpu",
+ ) -> None:
+ """Initialize the WorkerManager.
+
+ :param config_loader: Environment config loader for loading queues
+ and feature stores
+ :param worker_type: The type of worker to manage
+ :param dispatcher_queue: Queue from which the batched requests are pulled
+ :param as_service: Specifies run-once or run-until-complete behavior of service
+ :param cooldown: Number of seconds to wait before shutting down after
+ shutdown criteria are met
+ :param device: The device on which the worker should run. Every worker manager
+ is assigned exactly one GPU (if available), so the device should not include an index.
+ """
+ super().__init__(as_service, cooldown)
+
+ self._dispatcher_queue = dispatcher_queue
+ """The Dispatcher queue that the WorkerManager monitors for new batches"""
+ self._worker = worker_type()
+ """The ML Worker implementation"""
+ self._callback_factory = config_loader._callback_factory
+ """The type of communication channel to construct for callbacks"""
+ self._device = device
+ """Device on which workers need to run"""
+ self._cached_models: dict[str, t.Any] = {}
+ """Dictionary of previously loaded models"""
+ self._feature_stores: t.Dict[str, FeatureStore] = {}
+ """A collection of attached feature stores"""
+ self._featurestore_factory = config_loader._featurestore_factory
+ """A factory method to create a desired feature store client type"""
+ self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone()
+ """A standalone, system-created feature store used to share internal
+ information among MLI components"""
+ self._device_manager: t.Optional[DeviceManager] = None
+ """Object responsible for model caching and device access"""
+ self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True)
+ """Performance timer"""
+
+ @property
+ def has_featurestore_factory(self) -> bool:
+ """Check if the WorkerManager has a FeatureStore factory.
+
+ :returns: True if there is a FeatureStore factory, False otherwise
+ """
+ return self._featurestore_factory is not None
+
+ def _on_start(self) -> None:
+ """Called on initial entry into Service `execute` event loop before
+ `_on_iteration` is invoked."""
+ self._device_manager = DeviceManager(WorkerDevice(self._device))
+
+ def _check_feature_stores(self, batch: RequestBatch) -> bool:
+ """Ensures that all feature stores required by the request are available.
+
+ :param batch: The batch of requests to validate
+ :returns: False if feature store validation fails for the batch, True otherwise
+ """
+ # collect all feature stores required by the request
+ fs_model: t.Set[str] = set()
+ if batch.model_id.key:
+ fs_model = {batch.model_id.descriptor}
+ fs_inputs = {key.descriptor for key in batch.input_keys}
+ fs_outputs = {key.descriptor for key in batch.output_keys}
+
+ # identify which feature stores are requested and unknown
+ fs_desired = fs_model.union(fs_inputs).union(fs_outputs)
+ fs_actual = {item.descriptor for item in self._feature_stores.values()}
+ fs_missing = fs_desired - fs_actual
+
+ if not self.has_featurestore_factory:
+ logger.error("No feature store factory configured")
+ return False
+
+ # create the feature stores we need to service request
+ if fs_missing:
+ logger.debug(f"Adding feature store(s): {fs_missing}")
+ for descriptor in fs_missing:
+ feature_store = self._featurestore_factory(descriptor)
+ self._feature_stores[descriptor] = feature_store
+
+ return True
+
+ def _validate_batch(self, batch: RequestBatch) -> bool:
+ """Ensure the request can be processed.
+
+ :param batch: The batch of requests to validate
+ :returns: False if the request fails any validation checks, True otherwise
+ """
+ if batch is None or not batch.has_valid_requests:
+ return False
+
+ return self._check_feature_stores(batch)
+
+ # remove this when we are done with time measurements
+ # pylint: disable-next=too-many-statements
+ def _on_iteration(self) -> None:
+ """Executes calls to the machine learning worker implementation to complete
+ the inference pipeline."""
+ pre_batch_time = time.perf_counter()
+ try:
+ batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001)
+ except Empty:
+ return
+
+ self._perf_timer.start_timings(
+ "flush_requests", time.perf_counter() - pre_batch_time
+ )
+
+ if not self._validate_batch(batch):
+ exception_handler(
+ ValueError("An invalid batch was received"),
+ None,
+ None,
+ )
+ return
+
+ if not self._device_manager:
+ for request in batch.requests:
+ msg = "No Device Manager found. WorkerManager._on_start() "
+ "must be called after initialization. If possible, "
+ "you should use `WorkerManager.execute()` instead of "
+ "directly calling `_on_iteration()`."
+ try:
+ self._dispatcher_queue.put(batch)
+ except Exception:
+ msg += "\nThe batch could not be put back in the queue "
+ "and will not be processed."
+ exception_handler(
+ RuntimeError(msg),
+ request.callback,
+ "Error acquiring device manager",
+ )
+ return
+
+ try:
+ device_cm = self._device_manager.get_device(
+ worker=self._worker,
+ batch=batch,
+ feature_stores=self._feature_stores,
+ )
+ except Exception as exc:
+ for request in batch.requests:
+ exception_handler(
+ exc,
+ request.callback,
+ "Error loading model on device or getting device.",
+ )
+ return
+ self._perf_timer.measure_time("fetch_model")
+
+ with device_cm as device:
+
+ try:
+ model_result = LoadModelResult(device.get_model(batch.model_id.key))
+ except Exception as exc:
+ for request in batch.requests:
+ exception_handler(
+ exc, request.callback, "Error getting model from device."
+ )
+ return
+ self._perf_timer.measure_time("load_model")
+
+ if not batch.inputs:
+ for request in batch.requests:
+ exception_handler(
+ ValueError("Error batching inputs"),
+ request.callback,
+ None,
+ )
+ return
+ transformed_input = batch.inputs
+
+ try:
+ execute_result = self._worker.execute(
+ batch, model_result, transformed_input, device.name
+ )
+ except Exception as e:
+ for request in batch.requests:
+ exception_handler(e, request.callback, "Error while executing.")
+ return
+ self._perf_timer.measure_time("execute")
+
+ try:
+ transformed_outputs = self._worker.transform_output(
+ batch, execute_result
+ )
+ except Exception as e:
+ for request in batch.requests:
+ exception_handler(
+ e, request.callback, "Error while transforming the output."
+ )
+ return
+
+ for request, transformed_output in zip(batch.requests, transformed_outputs):
+ reply = InferenceReply()
+ if request.has_output_keys:
+ try:
+ reply.output_keys = self._worker.place_output(
+ request,
+ transformed_output,
+ self._feature_stores,
+ )
+ except Exception as e:
+ exception_handler(
+ e, request.callback, "Error while placing the output."
+ )
+ continue
+ else:
+ reply.outputs = transformed_output.outputs
+ self._perf_timer.measure_time("assign_output")
+
+ if not reply.has_outputs:
+ response = build_failure_reply("fail", "Outputs not found.")
+ else:
+ reply.status_enum = "complete"
+ reply.message = "Success"
+
+ results = self._worker.prepare_outputs(reply)
+ response = MessageHandler.build_response(
+ status=reply.status_enum,
+ message=reply.message,
+ result=results,
+ custom_attributes=None,
+ )
+
+ self._perf_timer.measure_time("build_reply")
+
+ serialized_resp = MessageHandler.serialize_response(response)
+
+ self._perf_timer.measure_time("serialize_resp")
+
+ if request.callback:
+ request.callback.send(serialized_resp)
+ if reply.has_outputs:
+ # send tensor data after response
+ for output in reply.outputs:
+ request.callback.send(output)
+ self._perf_timer.measure_time("send")
+
+ self._perf_timer.end_timings()
+
+ if self._perf_timer.max_length == 801:
+ self._perf_timer.print_timings(True)
+
+ def _can_shutdown(self) -> bool:
+ """Determine if the service can be shutdown.
+
+ :returns: True when criteria to shutdown the service are met, False otherwise
+ """
+ # todo: determine shutdown criteria
+ # will we receive a completion message?
+ # will we let MLI mgr just kill this?
+ # time_diff = self._last_event - datetime.datetime.now()
+ # if time_diff.total_seconds() > self._cooldown:
+ # return True
+ # return False
+ return self._worker is None
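+
+
+# Usage sketch: the WorkerManager consumes batches produced by a
+# RequestDispatcher through a shared queue; `config_loader`, `MyWorker`, and
+# `dispatcher` are placeholders for objects constructed elsewhere.
+#
+#   worker_manager = WorkerManager(
+#       config_loader=config_loader,
+#       worker_type=MyWorker,
+#       dispatcher_queue=dispatcher.task_queue,
+#       as_service=True,
+#       device="gpu",
+#   )
+#   worker_manager.execute()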
diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py
new file mode 100644
index 0000000000..5ba0fccc27
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/environment_loader.py
@@ -0,0 +1,116 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import typing as t
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class EnvironmentConfigLoader:
+ """
+ Facilitates the loading of a FeatureStore and Queue into the WorkerManager.
+ """
+
+ REQUEST_QUEUE_ENV_VAR = "_SMARTSIM_REQUEST_QUEUE"
+ """The environment variable that holds the request queue descriptor"""
+ BACKBONE_ENV_VAR = "_SMARTSIM_INFRA_BACKBONE"
+ """The environment variable that holds the backbone descriptor"""
+
+ def __init__(
+ self,
+ featurestore_factory: t.Callable[[str], FeatureStore],
+ callback_factory: t.Callable[[str], CommChannelBase],
+ queue_factory: t.Callable[[str], CommChannelBase],
+ ) -> None:
+ """Initialize the config loader instance with the factories necessary for
+ creating additional objects.
+
+ :param featurestore_factory: A factory method that produces a feature store
+ given a descriptor
+ :param callback_factory: A factory method that produces a callback
+ channel given a descriptor
+ :param queue_factory: A factory method that produces a queue
+ channel given a descriptor
+ """
+ self.queue: t.Optional[CommChannelBase] = None
+ """The attached incoming event queue channel"""
+ self.backbone: t.Optional[FeatureStore] = None
+ """The attached backbone feature store"""
+ self._featurestore_factory = featurestore_factory
+ """A factory method to instantiate a FeatureStore"""
+ self._callback_factory = callback_factory
+ """A factory method to instantiate a concrete CommChannelBase
+ for inference callbacks"""
+ self._queue_factory = queue_factory
+ """A factory method to instantiate a concrete CommChannelBase
+ for inference requests"""
+
+ def get_backbone(self) -> t.Optional[FeatureStore]:
+ """Attach to the backbone feature store using the descriptor found in
+ the environment variable `_SMARTSIM_INFRA_BACKBONE`. The backbone is
+ a standalone, system-created feature store used to share internal
+ information among MLI components.
+
+ :returns: The attached feature store via `_SMARTSIM_INFRA_BACKBONE`
+ """
+ descriptor = os.getenv(self.BACKBONE_ENV_VAR, "")
+
+ if not descriptor:
+ logger.warning("No backbone descriptor is configured")
+ return None
+
+ if self._featurestore_factory is None:
+ logger.warning(
+ "No feature store factory is configured. Backbone not created."
+ )
+ return None
+
+ self.backbone = self._featurestore_factory(descriptor)
+ return self.backbone
+
+ def get_queue(self) -> t.Optional[CommChannelBase]:
+ """Attach to a queue-like communication channel using the descriptor
+ found in the environment variable `_SMARTSIM_REQUEST_QUEUE`.
+
+ :returns: The attached queue specified via `_SMARTSIM_REQUEST_QUEUE`
+ """
+ descriptor = os.getenv(self.REQUEST_QUEUE_ENV_VAR, "")
+
+ if not descriptor:
+ logger.warning("No queue descriptor is configured")
+ return None
+
+ if self._queue_factory is None:
+ logger.warning("No queue factory is configured")
+ return None
+
+ self.queue = self._queue_factory(descriptor)
+ return self.queue
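
For orientation, a minimal usage sketch of the loader above. It assumes the launching process has already exported the `_SMARTSIM_INFRA_BACKBONE` and `_SMARTSIM_REQUEST_QUEUE` descriptors, and it stubs out the channel factory because the concrete `CommChannelBase` implementation is deployment-specific (the `channel_factory` name below is a placeholder, not part of this changeset).

```python
from smartsim._core.mli.comm.channel.channel import CommChannelBase
from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader
from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
    DragonFeatureStore,
)


def channel_factory(descriptor: str) -> CommChannelBase:
    """Hypothetical placeholder; swap in the concrete channel attach call."""
    raise NotImplementedError


config_loader = EnvironmentConfigLoader(
    featurestore_factory=DragonFeatureStore.from_descriptor,
    callback_factory=channel_factory,
    queue_factory=channel_factory,
)

backbone = config_loader.get_backbone()  # None when _SMARTSIM_INFRA_BACKBONE is unset
queue = config_loader.get_queue()        # None when _SMARTSIM_REQUEST_QUEUE is unset
```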
diff --git a/smartsim/_core/mli/infrastructure/storage/__init__.py b/smartsim/_core/mli/infrastructure/storage/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
new file mode 100644
index 0000000000..b12d7b11b4
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -0,0 +1,259 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import itertools
+import os
+import time
+import typing as t
+
+# pylint: disable=import-error
+# isort: off
+import dragon.data.ddict.ddict as dragon_ddict
+
+# isort: on
+
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
+ DragonFeatureStore,
+)
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class BackboneFeatureStore(DragonFeatureStore):
+ """A DragonFeatureStore wrapper with utility methods for accessing shared
+ information stored in the MLI backbone feature store."""
+
+ MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS"
+ """Unique key used in the backbone to locate the consumer list"""
+ MLI_REGISTRAR_CONSUMER = "_SMARTIM_MLI_REGISTRAR_CONSUMER"
+ """Unique key used in the backbone to locate the registration consumer"""
+ MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE"
+ """Unique key used in the backbone to locate MLI work queue"""
+ MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE"
+ """Unique key used in the backbone to locate the backbone feature store"""
+ _CREATED_ON = "creation"
+ """Unique key used in the backbone to locate the creation date of the
+ feature store"""
+ _DEFAULT_WAIT_TIMEOUT = 1.0
+ """The default wait time (in seconds) for blocking requests to
+ the feature store"""
+
+ def __init__(
+ self,
+ storage: dragon_ddict.DDict,
+ allow_reserved_writes: bool = False,
+ ) -> None:
+ """Initialize the DragonFeatureStore instance.
+
+ :param storage: A distributed dictionary to be used as the underlying
+ storage mechanism of the feature store
+ :param allow_reserved_writes: Whether reserved writes are allowed
+ """
+        super().__init__(storage)
+        self._enable_reserved_writes = allow_reserved_writes
+        self._wait_timeout = self._DEFAULT_WAIT_TIMEOUT
+        """The timeout (in seconds) exposed via the `wait_timeout` property"""
+
+        self._record_creation_data()
+
+ @property
+ def wait_timeout(self) -> float:
+ """Retrieve the wait timeout for this feature store. The wait timeout is
+ applied to all calls to `wait_for`.
+
+ :returns: The wait timeout (in seconds).
+ """
+ return self._wait_timeout
+
+ @wait_timeout.setter
+ def wait_timeout(self, value: float) -> None:
+ """Set the wait timeout (in seconds) for this feature store. The wait
+ timeout is applied to all calls to `wait_for`.
+
+ :param value: The new value to set
+ """
+ self._wait_timeout = value
+
+ @property
+ def notification_channels(self) -> t.Sequence[str]:
+ """Retrieve descriptors for all registered MLI notification channels.
+
+ :returns: The list of channel descriptors
+ """
+ if self.MLI_NOTIFY_CONSUMERS in self:
+ stored_consumers = self[self.MLI_NOTIFY_CONSUMERS]
+ return str(stored_consumers).split(",")
+ return []
+
+ @notification_channels.setter
+ def notification_channels(self, values: t.Sequence[str]) -> None:
+ """Set the notification channels to be sent events.
+
+ :param values: The list of channel descriptors to save
+ """
+ self[self.MLI_NOTIFY_CONSUMERS] = ",".join(
+ [str(value) for value in values if value]
+ )
+
+ @property
+ def backend_channel(self) -> t.Optional[str]:
+ """Retrieve the channel descriptor used to register event consumers.
+
+ :returns: The channel descriptor"""
+ if self.MLI_REGISTRAR_CONSUMER in self:
+ return str(self[self.MLI_REGISTRAR_CONSUMER])
+ return None
+
+ @backend_channel.setter
+ def backend_channel(self, value: str) -> None:
+ """Set the channel used to register event consumers.
+
+ :param value: The stringified channel descriptor"""
+ self[self.MLI_REGISTRAR_CONSUMER] = value
+
+ @property
+ def worker_queue(self) -> t.Optional[str]:
+ """Retrieve the channel descriptor used to send work to MLI worker managers.
+
+ :returns: The channel descriptor, if found. Otherwise, `None`"""
+ if self.MLI_WORKER_QUEUE in self:
+ return str(self[self.MLI_WORKER_QUEUE])
+ return None
+
+ @worker_queue.setter
+ def worker_queue(self, value: str) -> None:
+ """Set the channel descriptor used to send work to MLI worker managers.
+
+ :param value: The channel descriptor"""
+ self[self.MLI_WORKER_QUEUE] = value
+
+ @property
+ def creation_date(self) -> str:
+ """Return the creation date for the backbone feature store.
+
+        :returns: The string-formatted date when the feature store was created"""
+ return str(self[self._CREATED_ON])
+
+ def _record_creation_data(self) -> None:
+ """Write the creation timestamp to the feature store."""
+ if self._CREATED_ON not in self:
+ if not self._allow_reserved_writes:
+ logger.warning(
+ "Recorded creation from a write-protected backbone instance"
+ )
+ self[self._CREATED_ON] = str(time.time())
+
+ os.environ[self.MLI_BACKBONE] = self.descriptor
+
+ @classmethod
+ def from_writable_descriptor(
+ cls,
+ descriptor: str,
+ ) -> "BackboneFeatureStore":
+ """A factory method that creates an instance from a descriptor string.
+
+ :param descriptor: The descriptor that uniquely identifies the resource
+ :returns: An attached DragonFeatureStore
+ :raises SmartSimError: if attachment to DragonFeatureStore fails
+ """
+ try:
+ return BackboneFeatureStore(dragon_ddict.DDict.attach(descriptor), True)
+ except Exception as ex:
+ raise SmartSimError(
+ f"Error creating backbone feature store: {descriptor}"
+ ) from ex
+
+ def _check_wait_timeout(
+ self, start_time: float, timeout: float, indicators: t.Dict[str, bool]
+ ) -> None:
+ """Perform timeout verification.
+
+ :param start_time: the start time to use for elapsed calculation
+ :param timeout: the timeout (in seconds)
+ :param indicators: latest retrieval status for requested keys
+ :raises SmartSimError: If the timeout elapses before all values are
+ retrieved
+ """
+ elapsed = time.time() - start_time
+ if timeout and elapsed > timeout:
+ raise SmartSimError(
+ f"Backbone {self.descriptor=} timeout after {elapsed} "
+ f"seconds retrieving keys: {indicators}"
+ )
+
+ def wait_for(
+ self, keys: t.List[str], timeout: float = _DEFAULT_WAIT_TIMEOUT
+ ) -> t.Dict[str, t.Union[str, bytes, None]]:
+ """Perform a blocking wait until all specified keys have been found
+ in the backbone.
+
+ :param keys: The required collection of keys to retrieve
+ :param timeout: The maximum wait time in seconds
+ :returns: Dictionary containing the keys and values requested
+ :raises SmartSimError: If the timeout elapses without retrieving
+ all requested keys
+ """
+ if timeout < 0:
+ timeout = self._DEFAULT_WAIT_TIMEOUT
+ logger.info(f"Using default wait_for timeout: {timeout}s")
+
+ if not keys:
+ return {}
+
+ values: t.Dict[str, t.Union[str, bytes, None]] = {k: None for k in set(keys)}
+ is_found = {k: False for k in values.keys()}
+
+ backoff = (0.1, 0.2, 0.4, 0.8)
+ backoff_iter = itertools.cycle(backoff)
+ start_time = time.time()
+
+ while not all(is_found.values()):
+ delay = next(backoff_iter)
+
+ for key in [k for k, v in is_found.items() if not v]:
+ try:
+ values[key] = self[key]
+ is_found[key] = True
+ except Exception:
+ if delay == backoff[-1]:
+ logger.debug(f"Re-attempting `{key}` retrieval in {delay}s")
+
+ if all(is_found.values()):
+ logger.debug(f"wait_for({keys}) retrieved all keys")
+ continue
+
+ self._check_wait_timeout(start_time, timeout, is_found)
+ time.sleep(delay)
+
+ return values
+
+ def get_env(self) -> t.Dict[str, str]:
+ """Returns a dictionary populated with environment variables necessary to
+ connect a process to the existing backbone instance.
+
+ :returns: The dictionary populated with env vars
+ """
+ return {self.MLI_BACKBONE: self.descriptor}
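
As a hedged sketch of how a consumer might use the helpers above: attach to the backbone from its environment descriptor, block on the worker-queue key with `wait_for`, and forward connection information to a child process with `get_env()`. The environment variable is assumed to have been populated by the process that created the backbone.

```python
import os

from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
    BackboneFeatureStore,
)

# descriptor assumed to be exported by the process that created the backbone
descriptor = os.environ[BackboneFeatureStore.MLI_BACKBONE]
backbone = BackboneFeatureStore.from_writable_descriptor(descriptor)

# block (up to 10 seconds) until the worker queue descriptor is published
values = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], timeout=10.0)
queue_descriptor = values[BackboneFeatureStore.MLI_WORKER_QUEUE]

# environment variables that let a child process attach to the same backbone
child_env = {**os.environ, **backbone.get_env()}
```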
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
new file mode 100644
index 0000000000..24f2221c87
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
@@ -0,0 +1,126 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+
+# pylint: disable=import-error
+# isort: off
+import dragon.data.ddict.ddict as dragon_ddict
+
+# isort: on
+
+from smartsim._core.mli.infrastructure.storage.dragon_util import (
+ ddict_to_descriptor,
+ descriptor_to_ddict,
+)
+from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore
+from smartsim.error import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class DragonFeatureStore(FeatureStore):
+ """A feature store backed by a dragon distributed dictionary."""
+
+ def __init__(self, storage: "dragon_ddict.DDict") -> None:
+ """Initialize the DragonFeatureStore instance.
+
+ :param storage: A distributed dictionary to be used as the underlying
+ storage mechanism of the feature store"""
+ if storage is None:
+ raise ValueError(
+ "Storage is required when instantiating a DragonFeatureStore."
+ )
+
+ descriptor = ""
+ if isinstance(storage, dragon_ddict.DDict):
+ descriptor = ddict_to_descriptor(storage)
+
+ super().__init__(descriptor)
+ self._storage: t.Dict[str, t.Union[str, bytes]] = storage
+ """The underlying storage mechanism of the DragonFeatureStore; a
+ distributed, in-memory key-value store"""
+
+ def _get(self, key: str) -> t.Union[str, bytes]:
+ """Retrieve a value from the underlying storage mechanism.
+
+ :param key: The unique key that identifies the resource
+ :returns: The value identified by the key
+ :raises KeyError: If the key has not been used to store a value
+ """
+ try:
+ return self._storage[key]
+ except dragon_ddict.DDictError as e:
+ raise KeyError(f"Key not found in FeatureStore: {key}") from e
+
+ def _set(self, key: str, value: t.Union[str, bytes]) -> None:
+ """Store a value into the underlying storage mechanism.
+
+ :param key: The unique key that identifies the resource
+ :param value: The value to store
+ :returns: The value identified by the key
+ """
+ self._storage[key] = value
+
+ def _contains(self, key: str) -> bool:
+ """Determine if the storage mechanism contains a given key.
+
+ :param key: The unique key that identifies the resource
+ :returns: True if the key is defined, False otherwise
+ """
+ return key in self._storage
+
+ def pop(self, key: str) -> t.Union[str, bytes, None]:
+ """Remove the value from the dictionary and return the value.
+
+ :param key: Dictionary key to retrieve
+        :returns: The value held at the key if it exists, otherwise `None`
+        """
+ try:
+ return self._storage.pop(key)
+ except dragon_ddict.DDictError:
+ return None
+
+ @classmethod
+ def from_descriptor(
+ cls,
+ descriptor: str,
+ ) -> "DragonFeatureStore":
+ """A factory method that creates an instance from a descriptor string.
+
+ :param descriptor: The descriptor that uniquely identifies the resource
+ :returns: An attached DragonFeatureStore
+ :raises SmartSimError: If attachment to DragonFeatureStore fails
+ """
+ try:
+ logger.debug(f"Attaching to FeatureStore with descriptor: {descriptor}")
+ storage = descriptor_to_ddict(descriptor)
+ return cls(storage)
+ except Exception as ex:
+ raise SmartSimError(
+ f"Error creating dragon feature store from descriptor: {descriptor}"
+ ) from ex
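
To make the mapping-style behaviour above concrete, a small sketch (the descriptor is assumed to be supplied by whichever process created the underlying DDict) showing set, membership, lookup, and the `pop` fallback:

```python
import os

from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
    DragonFeatureStore,
)

# descriptor is assumed to have been shared by the DDict's creator
descriptor = os.environ["_SMARTSIM_INFRA_BACKBONE"]
feature_store = DragonFeatureStore.from_descriptor(descriptor)

feature_store["my-tensor"] = b"\x00\x01\x02\x03"          # routed through _set
assert "my-tensor" in feature_store                       # routed through _contains
assert feature_store["my-tensor"] == b"\x00\x01\x02\x03"  # routed through _get

# missing keys raise KeyError on lookup, while pop returns None instead
assert feature_store.pop("not-a-key") is None
```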
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_util.py b/smartsim/_core/mli/infrastructure/storage/dragon_util.py
new file mode 100644
index 0000000000..50d15664c0
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_util.py
@@ -0,0 +1,101 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# pylint: disable=import-error
+# isort: off
+import dragon.data.ddict.ddict as dragon_ddict
+
+# isort: on
+
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+def ddict_to_descriptor(ddict: dragon_ddict.DDict) -> str:
+ """Convert a DDict to a descriptor string.
+
+ :param ddict: The dragon dictionary to convert
+ :returns: The descriptor string
+ :raises ValueError: If a ddict is not provided
+ """
+ if ddict is None:
+ raise ValueError("DDict is not available to create a descriptor")
+
+ # unlike other dragon objects, the dictionary serializes to a string
+ # instead of bytes
+ return str(ddict.serialize())
+
+
+def descriptor_to_ddict(descriptor: str) -> dragon_ddict.DDict:
+ """Create and attach a new DDict instance given
+ the string-encoded descriptor.
+
+ :param descriptor: The descriptor of a dictionary to attach to
+ :returns: The attached dragon dictionary"""
+ return dragon_ddict.DDict.attach(descriptor)
+
+
+def create_ddict(
+ num_nodes: int, mgr_per_node: int, mem_per_node: int
+) -> dragon_ddict.DDict:
+ """Create a distributed dragon dictionary.
+
+ :param num_nodes: The number of distributed nodes to distribute the dictionary to.
+ At least one node is required.
+ :param mgr_per_node: The number of manager processes per node
+    :param mem_per_node: The amount of memory (in megabytes) to allocate per node. Total
+        memory available will be calculated as `num_nodes * mem_per_node`
+
+ :returns: The instantiated dragon dictionary
+ :raises ValueError: If invalid num_nodes is supplied
+ :raises ValueError: If invalid mem_per_node is supplied
+ :raises ValueError: If invalid mgr_per_node is supplied
+ """
+ if num_nodes < 1:
+ raise ValueError("A dragon dictionary must have at least 1 node")
+
+ if mgr_per_node < 1:
+ raise ValueError("A dragon dict requires at least 2 managers per ndode")
+
+ if mem_per_node < dragon_ddict.DDICT_MIN_SIZE:
+ raise ValueError(
+ "A dragon dictionary requires at least "
+ f"{dragon_ddict.DDICT_MIN_SIZE / 1024} MB"
+ )
+
+ mem_total = num_nodes * mem_per_node
+
+ logger.debug(
+ f"Creating dragon dictionary with {num_nodes} nodes, {mem_total} MB memory"
+ )
+
+ distributed_dict = dragon_ddict.DDict(num_nodes, mgr_per_node, total_mem=mem_total)
+ logger.debug(
+ "Successfully created dragon dictionary with "
+ f"{num_nodes} nodes, {mem_total} MB total memory"
+ )
+ return distributed_dict
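
A brief usage sketch for the helpers above, assuming an active Dragon runtime; the sizing values are illustrative only and must satisfy `DDICT_MIN_SIZE`:

```python
import dragon.data.ddict.ddict as dragon_ddict

from smartsim._core.mli.infrastructure.storage.dragon_util import (
    create_ddict,
    ddict_to_descriptor,
    descriptor_to_ddict,
)

# single node, single manager; memory sized to the library minimum
ddict = create_ddict(
    num_nodes=1, mgr_per_node=1, mem_per_node=dragon_ddict.DDICT_MIN_SIZE
)

# the descriptor is a plain string, safe to pass through the environment
descriptor = ddict_to_descriptor(ddict)
attached = descriptor_to_ddict(descriptor)
attached["hello"] = "world"
```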
diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py
new file mode 100644
index 0000000000..ebca07ed4e
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py
@@ -0,0 +1,224 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import enum
+import typing as t
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class ReservedKeys(str, enum.Enum):
+ """Contains constants used to identify all featurestore keys that
+    may not be used by user code. Avoids overwriting system data."""
+
+ MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS"
+ """Storage location for the list of registered consumers that will receive
+ events from an EventBroadcaster"""
+
+ MLI_REGISTRAR_CONSUMER = "_SMARTIM_MLI_REGISTRAR_CONSUMER"
+ """Storage location for the channel used to send messages directly to
+ the MLI backend"""
+
+ MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE"
+ """Storage location for the channel used to send work requests
+ to the available worker managers"""
+
+ @classmethod
+ def contains(cls, value: str) -> bool:
+ """Convert a string representation into an enumeration member.
+
+ :param value: The string to convert
+ :returns: The enumeration member if the conversion succeeded, otherwise None
+ """
+ try:
+ cls(value)
+ except ValueError:
+ return False
+
+ return True
+
+
+@dataclass(frozen=True)
+class TensorKey:
+ """A key,descriptor pair enabling retrieval of an item from a feature store."""
+
+ key: str
+ """The unique key of an item in a feature store"""
+ descriptor: str
+ """The unique identifier of the feature store containing the key"""
+
+ def __post_init__(self) -> None:
+ """Ensure the key and descriptor have at least one character.
+
+ :raises ValueError: If key or descriptor are empty strings
+ """
+ if len(self.key) < 1:
+ raise ValueError("Key must have at least one character.")
+ if len(self.descriptor) < 1:
+ raise ValueError("Descriptor must have at least one character.")
+
+
+@dataclass(frozen=True)
+class ModelKey:
+ """A key,descriptor pair enabling retrieval of an item from a feature store."""
+
+ key: str
+ """The unique key of an item in a feature store"""
+ descriptor: str
+ """The unique identifier of the feature store containing the key"""
+
+ def __post_init__(self) -> None:
+ """Ensure the key and descriptor have at least one character.
+
+ :raises ValueError: If key or descriptor are empty strings
+ """
+ if len(self.key) < 1:
+ raise ValueError("Key must have at least one character.")
+ if len(self.descriptor) < 1:
+ raise ValueError("Descriptor must have at least one character.")
+
+
+class FeatureStore(ABC):
+ """Abstract base class providing the common interface for retrieving
+ values from a feature store implementation."""
+
+ def __init__(self, descriptor: str, allow_reserved_writes: bool = False) -> None:
+ """Initialize the feature store.
+
+ :param descriptor: The stringified version of a storage descriptor
+ :param allow_reserved_writes: Override the default behavior of blocking
+ writes to reserved keys
+ """
+ self._enable_reserved_writes = allow_reserved_writes
+ """Flag used to ensure that any keys written by the system to a feature store
+ are not overwritten by user code. Disabled by default. Subclasses must set the
+ value intentionally."""
+ self._descriptor = descriptor
+ """Stringified version of the unique ID enabling a client to connect
+ to the feature store"""
+
+ def _check_reserved(self, key: str) -> None:
+ """A utility method used to verify access to write to a reserved key
+        in the FeatureStore. Used by subclasses in __setitem__ implementations.
+
+ :param key: A key to compare to the reserved keys
+ :raises SmartSimError: If the key is reserved
+ """
+ if not self._enable_reserved_writes and ReservedKeys.contains(key):
+ raise SmartSimError(
+ "Use of reserved key denied. "
+ "Unable to overwrite system configuration"
+ )
+
+ def __getitem__(self, key: str) -> t.Union[str, bytes]:
+ """Retrieve an item using key.
+
+ :param key: Unique key of an item to retrieve from the feature store
+ :returns: An item in the FeatureStore
+ :raises SmartSimError: If retrieving fails
+ """
+ try:
+ return self._get(key)
+ except KeyError:
+ raise
+ except Exception as ex:
+ # note: explicitly avoid round-trip to check for key existence
+ raise SmartSimError(
+ f"Could not get value for existing key {key}, error:\n{ex}"
+ ) from ex
+
+ def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None:
+ """Assign a value using key.
+
+ :param key: Unique key of an item to set in the feature store
+ :param value: Value to persist in the feature store
+ """
+ self._check_reserved(key)
+ self._set(key, value)
+
+ def __contains__(self, key: str) -> bool:
+ """Membership operator to test for a key existing within the feature store.
+
+ :param key: Unique key of an item to retrieve from the feature store
+ :returns: `True` if the key is found, `False` otherwise
+ """
+ return self._contains(key)
+
+ @abstractmethod
+ def _get(self, key: str) -> t.Union[str, bytes]:
+ """Retrieve a value from the underlying storage mechanism.
+
+ :param key: The unique key that identifies the resource
+ :returns: The value identified by the key
+ :raises KeyError: If the key has not been used to store a value
+ """
+
+ @abstractmethod
+ def _set(self, key: str, value: t.Union[str, bytes]) -> None:
+ """Store a value into the underlying storage mechanism.
+
+ :param key: The unique key that identifies the resource
+ :param value: The value to store
+ """
+
+ @abstractmethod
+ def _contains(self, key: str) -> bool:
+ """Determine if the storage mechanism contains a given key.
+
+ :param key: The unique key that identifies the resource
+ :returns: `True` if the key is defined, `False` otherwise
+ """
+
+ @property
+ def _allow_reserved_writes(self) -> bool:
+ """Return the boolean flag indicating if writing to reserved keys is
+ enabled for this feature store.
+
+ :returns: `True` if enabled, `False` otherwise
+ """
+ return self._enable_reserved_writes
+
+ @_allow_reserved_writes.setter
+ def _allow_reserved_writes(self, value: bool) -> None:
+ """Modify the boolean flag indicating if writing to reserved keys is
+ enabled for this feature store.
+
+ :param value: The new value to set for the flag
+ """
+ self._enable_reserved_writes = value
+
+ @property
+ def descriptor(self) -> str:
+ """Unique identifier enabling a client to connect to the feature store.
+
+ :returns: A descriptor encoded as a string
+ """
+ return self._descriptor
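
For illustration only (not part of this changeset), a minimal in-memory subclass showing how `_get`, `_set`, and `_contains` plug into the base-class operators and how the reserved-key guard behaves:

```python
import typing as t

from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore


class DictFeatureStore(FeatureStore):
    """Toy feature store backed by a plain Python dict."""

    def __init__(self, descriptor: str = "local-dict") -> None:
        super().__init__(descriptor)
        self._data: t.Dict[str, t.Union[str, bytes]] = {}

    def _get(self, key: str) -> t.Union[str, bytes]:
        return self._data[key]  # KeyError on a missing key, as the base class expects

    def _set(self, key: str, value: t.Union[str, bytes]) -> None:
        self._data[key] = value

    def _contains(self, key: str) -> bool:
        return key in self._data


store = DictFeatureStore()
store["weights"] = b"\x00\x01"
assert "weights" in store

# writing to a reserved key raises SmartSimError unless _allow_reserved_writes is set
# store["_SMARTSIM_REQUEST_QUEUE"] = "denied"
```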
diff --git a/smartsim/_core/mli/infrastructure/worker/__init__.py b/smartsim/_core/mli/infrastructure/worker/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py
new file mode 100644
index 0000000000..64e94e5eb6
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py
@@ -0,0 +1,276 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import io
+
+import numpy as np
+import torch
+
+# pylint: disable=import-error
+from dragon.managed_memory import MemoryAlloc, MemoryPool
+
+from .....error import SmartSimError
+from .....log import get_logger
+from ...mli_schemas.tensor import tensor_capnp
+from .worker import (
+ ExecuteResult,
+ FetchInputResult,
+ FetchModelResult,
+ LoadModelResult,
+ MachineLearningWorkerBase,
+ RequestBatch,
+ TransformInputResult,
+ TransformOutputResult,
+)
+
+# pylint: enable=import-error
+
+
+torch.set_num_threads(1)
+torch.set_num_interop_threads(4)
+logger = get_logger(__name__)
+
+
+class TorchWorker(MachineLearningWorkerBase):
+ """A worker that executes a PyTorch model."""
+
+ @staticmethod
+ def load_model(
+ batch: RequestBatch, fetch_result: FetchModelResult, device: str
+ ) -> LoadModelResult:
+ """Given a loaded MachineLearningModel, ensure it is loaded into
+ device memory.
+
+ :param request: The request that triggered the pipeline
+ :param device: The device on which the model must be placed
+ :returns: LoadModelResult wrapping the model loaded for the request
+ :raises ValueError: If model reference object is not found
+ :raises RuntimeError: If loading and evaluating the model failed
+ """
+ if fetch_result.model_bytes:
+ model_bytes = fetch_result.model_bytes
+ elif batch.raw_model and batch.raw_model.data:
+ model_bytes = batch.raw_model.data
+ else:
+ raise ValueError("Unable to load model without reference object")
+
+ device_to_torch = {"cpu": "cpu", "gpu": "cuda"}
+ for old, new in device_to_torch.items():
+ device = device.replace(old, new)
+
+ buffer = io.BytesIO(initial_bytes=model_bytes)
+ try:
+ with torch.no_grad():
+ model = torch.jit.load(buffer, map_location=device) # type: ignore
+ model.eval()
+ except Exception as e:
+ raise RuntimeError(
+ "Failed to load and evaluate the model: "
+ f"Model key {batch.model_id.key}, Device {device}"
+ ) from e
+ result = LoadModelResult(model)
+ return result
+
+ @staticmethod
+ def transform_input(
+ batch: RequestBatch,
+ fetch_results: list[FetchInputResult],
+ mem_pool: MemoryPool,
+ ) -> TransformInputResult:
+ """Given a collection of data, perform a transformation on the data and put
+ the raw tensor data on a MemoryPool allocation.
+
+        :param batch: The batch of requests that triggered the pipeline
+        :param fetch_results: Raw outputs from fetching inputs out of a feature store
+ :param mem_pool: The memory pool used to access batched input tensors
+ :returns: The transformed inputs wrapped in a TransformInputResult
+ :raises ValueError: If tensors cannot be reconstructed
+ :raises IndexError: If index out of range
+ """
+        results: list[bytes] = []
+ total_samples = 0
+ slices: list[slice] = []
+
+ all_dims: list[list[int]] = []
+ all_dtypes: list[str] = []
+ if fetch_results[0].meta is None:
+ raise ValueError("Cannot reconstruct tensor without meta information")
+ # Traverse inputs to get total number of samples and compute slices
+ # Assumption: first dimension is samples, all tensors in the same input
+ # have same number of samples
+ # thus we only look at the first tensor for each input
+ for res_idx, fetch_result in enumerate(fetch_results):
+ if fetch_result.meta is None or any(
+ item_meta is None for item_meta in fetch_result.meta
+ ):
+ raise ValueError("Cannot reconstruct tensor without meta information")
+ first_tensor_desc: tensor_capnp.TensorDescriptor = fetch_result.meta[0]
+ num_samples = first_tensor_desc.dimensions[0]
+ slices.append(slice(total_samples, total_samples + num_samples))
+ total_samples = total_samples + num_samples
+
+ if res_idx == len(fetch_results) - 1:
+ # For each tensor in the last input, get remaining dimensions
+ # Assumptions: all inputs have the same number of tensors and
+ # last N-1 dimensions match across inputs for corresponding tensors
+ # thus: resulting array will be of size (num_samples, all_other_dims)
+ for item_meta in fetch_result.meta:
+ tensor_desc: tensor_capnp.TensorDescriptor = item_meta
+ tensor_dims = list(tensor_desc.dimensions)
+ all_dims.append([total_samples, *tensor_dims[1:]])
+ all_dtypes.append(str(tensor_desc.dataType))
+
+ for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)):
+ itemsize = np.empty((1), dtype=dtype).itemsize
+ alloc_size = int(np.prod(dims) * itemsize)
+ mem_alloc = mem_pool.alloc(alloc_size)
+ mem_view = mem_alloc.get_memview()
+ try:
+ mem_view[:alloc_size] = b"".join(
+ [
+ fetch_result.inputs[result_tensor_idx]
+ for fetch_result in fetch_results
+ ]
+ )
+ except IndexError as e:
+ raise IndexError(
+ "Error accessing elements in fetch_result.inputs "
+ f"with index {result_tensor_idx}"
+ ) from e
+
+ results.append(mem_alloc.serialize())
+
+ return TransformInputResult(results, slices, all_dims, all_dtypes)
+
+ # pylint: disable-next=unused-argument
+ @staticmethod
+ def execute(
+ batch: RequestBatch,
+ load_result: LoadModelResult,
+ transform_result: TransformInputResult,
+ device: str,
+ ) -> ExecuteResult:
+ """Execute an ML model on inputs transformed for use by the model.
+
+ :param batch: The batch of requests that triggered the pipeline
+ :param load_result: The result of loading the model onto device memory
+ :param transform_result: The result of transforming inputs for model consumption
+ :param device: The device on which the model will be executed
+ :returns: The result of inference wrapped in an ExecuteResult
+ :raises SmartSimError: If model is not loaded
+ :raises IndexError: If memory slicing is out of range
+ :raises ValueError: If tensor creation fails or is unable to evaluate the model
+ """
+ if not load_result.model:
+ raise SmartSimError("Model must be loaded to execute")
+ device_to_torch = {"cpu": "cpu", "gpu": "cuda"}
+ for old, new in device_to_torch.items():
+ device = device.replace(old, new)
+
+ tensors = []
+ mem_allocs = []
+ for transformed, dims, dtype in zip(
+ transform_result.transformed, transform_result.dims, transform_result.dtypes
+ ):
+ mem_alloc = MemoryAlloc.attach(transformed)
+ mem_allocs.append(mem_alloc)
+ itemsize = np.empty((1), dtype=dtype).itemsize
+ try:
+ tensors.append(
+ torch.from_numpy(
+ np.frombuffer(
+ mem_alloc.get_memview()[0 : np.prod(dims) * itemsize],
+ dtype=dtype,
+ ).reshape(dims)
+ )
+ )
+ except IndexError as e:
+ raise IndexError("Error during memory slicing") from e
+ except Exception as e:
+ raise ValueError("Error during tensor creation") from e
+
+ model: torch.nn.Module = load_result.model
+ try:
+ with torch.no_grad():
+ model.eval()
+ results = [
+ model(
+ *[
+ tensor.to(device, non_blocking=True).detach()
+ for tensor in tensors
+ ]
+ )
+ ]
+ except Exception as e:
+ raise ValueError(
+ f"Error while evaluating the model: Model {batch.model_id.key}"
+ ) from e
+
+ transform_result.transformed = []
+
+ execute_result = ExecuteResult(results, transform_result.slices)
+ for mem_alloc in mem_allocs:
+ mem_alloc.free()
+ return execute_result
+
+ @staticmethod
+ def transform_output(
+ batch: RequestBatch,
+ execute_result: ExecuteResult,
+ ) -> list[TransformOutputResult]:
+ """Given inference results, perform transformations required to
+ transmit results to the requestor.
+
+ :param batch: The batch of requests that triggered the pipeline
+ :param execute_result: The result of inference wrapped in an ExecuteResult
+ :returns: A list of transformed outputs
+ :raises IndexError: If indexing is out of range
+ :raises ValueError: If transforming output fails
+ """
+ transformed_list: list[TransformOutputResult] = []
+ cpu_predictions = [
+ prediction.cpu() for prediction in execute_result.predictions
+ ]
+ for result_slice in execute_result.slices:
+ transformed = []
+ for cpu_item in cpu_predictions:
+ try:
+ transformed.append(cpu_item[result_slice].numpy().tobytes())
+
+ # todo: need the shape from latest schemas added here.
+ transformed_list.append(
+ TransformOutputResult(transformed, None, "c", "float32")
+ ) # fixme
+ except IndexError as e:
+ raise IndexError(
+ f"Error accessing elements: result_slice {result_slice}"
+ ) from e
+ except Exception as e:
+ raise ValueError("Error transforming output") from e
+
+ execute_result.predictions = []
+
+ return transformed_list
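
For orientation, a hedged sketch of the `load_model` stage in isolation (assuming a Dragon-enabled environment so the module imports cleanly): a TorchScript model is serialized to bytes, standing in for the payload of a `FetchModelResult`, and loaded on CPU. The key and descriptor strings below are placeholders.

```python
import io

import torch

from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey
from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker
from smartsim._core.mli.infrastructure.worker.worker import (
    FetchModelResult,
    RequestBatch,
)

# serialize a trivial TorchScript model, standing in for bytes from a feature store
scripted = torch.jit.script(torch.nn.Linear(8, 2))
buffer = io.BytesIO()
torch.jit.save(scripted, buffer)

batch = RequestBatch(
    requests=[],
    inputs=None,
    model_id=ModelKey(key="toy-model", descriptor="example-fs"),
)
fetch_result = FetchModelResult(buffer.getvalue())

load_result = TorchWorker.load_model(batch, fetch_result, device="cpu")
assert load_result.model is not None
```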
diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py
new file mode 100644
index 0000000000..9556b8e438
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/worker/worker.py
@@ -0,0 +1,646 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# pylint: disable=import-error
+from dragon.managed_memory import MemoryPool
+
+# isort: off
+# isort: on
+
+import typing as t
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+from .....error import SmartSimError
+from .....log import get_logger
+from ...comm.channel.channel import CommChannelBase
+from ...message_handler import MessageHandler
+from ...mli_schemas.model.model_capnp import Model
+from ..storage.feature_store import FeatureStore, ModelKey, TensorKey
+
+if t.TYPE_CHECKING:
+ from smartsim._core.mli.mli_schemas.response.response_capnp import Status
+ from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor
+
+logger = get_logger(__name__)
+
+# Placeholder
+ModelIdentifier = ModelKey
+
+
+class InferenceRequest:
+ """Internal representation of an inference request from a client."""
+
+ def __init__(
+ self,
+ model_key: t.Optional[ModelKey] = None,
+ callback: t.Optional[CommChannelBase] = None,
+ raw_inputs: t.Optional[t.List[bytes]] = None,
+ input_keys: t.Optional[t.List[TensorKey]] = None,
+ input_meta: t.Optional[t.List[t.Any]] = None,
+ output_keys: t.Optional[t.List[TensorKey]] = None,
+ raw_model: t.Optional[Model] = None,
+ batch_size: int = 0,
+ ):
+ """Initialize the InferenceRequest.
+
+ :param model_key: A tuple containing a (key, descriptor) pair
+ :param callback: The channel used for notification of inference completion
+ :param raw_inputs: Raw bytes of tensor inputs
+ :param input_keys: A list of tuples containing a (key, descriptor) pair
+ :param input_meta: Metadata about the input data
+ :param output_keys: A list of tuples containing a (key, descriptor) pair
+ :param raw_model: Raw bytes of an ML model
+ :param batch_size: The batch size to apply when batching
+ """
+ self.model_key = model_key
+ """A tuple containing a (key, descriptor) pair"""
+ self.raw_model = raw_model
+ """Raw bytes of an ML model"""
+ self.callback = callback
+ """The channel used for notification of inference completion"""
+ self.raw_inputs = raw_inputs or []
+ """Raw bytes of tensor inputs"""
+ self.input_keys = input_keys or []
+ """A list of tuples containing a (key, descriptor) pair"""
+ self.input_meta = input_meta or []
+ """Metadata about the input data"""
+ self.output_keys = output_keys or []
+ """A list of tuples containing a (key, descriptor) pair"""
+ self.batch_size = batch_size
+ """The batch size to apply when batching"""
+
+ @property
+ def has_raw_model(self) -> bool:
+ """Check if the InferenceRequest contains a raw_model.
+
+ :returns: True if raw_model is not None, False otherwise
+ """
+ return self.raw_model is not None
+
+ @property
+ def has_model_key(self) -> bool:
+ """Check if the InferenceRequest contains a model_key.
+
+ :returns: True if model_key is not None, False otherwise
+ """
+ return self.model_key is not None
+
+ @property
+ def has_raw_inputs(self) -> bool:
+ """Check if the InferenceRequest contains raw_inputs.
+
+        :returns: True if raw_inputs is not None and is not an empty list,
+ False otherwise
+ """
+ return self.raw_inputs is not None and bool(self.raw_inputs)
+
+ @property
+ def has_input_keys(self) -> bool:
+ """Check if the InferenceRequest contains input_keys.
+
+ :returns: True if input_keys is not None and is not an empty list,
+ False otherwise
+ """
+ return self.input_keys is not None and bool(self.input_keys)
+
+ @property
+ def has_output_keys(self) -> bool:
+ """Check if the InferenceRequest contains output_keys.
+
+ :returns: True if output_keys is not None and is not an empty list,
+ False otherwise
+ """
+ return self.output_keys is not None and bool(self.output_keys)
+
+ @property
+ def has_input_meta(self) -> bool:
+ """Check if the InferenceRequest contains input_meta.
+
+ :returns: True if input_meta is not None and is not an empty list,
+ False otherwise
+ """
+ return self.input_meta is not None and bool(self.input_meta)
+
+
+class InferenceReply:
+ """Internal representation of the reply to a client request for inference."""
+
+ def __init__(
+ self,
+ outputs: t.Optional[t.Collection[t.Any]] = None,
+ output_keys: t.Optional[t.Collection[TensorKey]] = None,
+ status_enum: "Status" = "running",
+ message: str = "In progress",
+ ) -> None:
+ """Initialize the InferenceReply.
+
+ :param outputs: List of output data
+ :param output_keys: List of keys used for output data
+ :param status_enum: Status of the reply
+ :param message: Status message that corresponds with the status enum
+ """
+ self.outputs: t.Collection[t.Any] = outputs or []
+ """List of output data"""
+ self.output_keys: t.Collection[t.Optional[TensorKey]] = output_keys or []
+ """List of keys used for output data"""
+ self.status_enum = status_enum
+ """Status of the reply"""
+ self.message = message
+ """Status message that corresponds with the status enum"""
+
+ @property
+ def has_outputs(self) -> bool:
+ """Check if the InferenceReply contains outputs.
+
+ :returns: True if outputs is not None and is not an empty list,
+ False otherwise
+ """
+ return self.outputs is not None and bool(self.outputs)
+
+ @property
+ def has_output_keys(self) -> bool:
+ """Check if the InferenceReply contains output_keys.
+
+ :returns: True if output_keys is not None and is not an empty list,
+ False otherwise
+ """
+ return self.output_keys is not None and bool(self.output_keys)
+
+
+class LoadModelResult:
+ """A wrapper around a loaded model."""
+
+ def __init__(self, model: t.Any) -> None:
+ """Initialize the LoadModelResult.
+
+ :param model: The loaded model
+ """
+ self.model = model
+ """The loaded model (e.g. a TensorFlow, PyTorch, ONNX, etc. model)"""
+
+
+class TransformInputResult:
+ """A wrapper around a transformed batch of input tensors"""
+
+ def __init__(
+ self,
+ result: t.Any,
+ slices: list[slice],
+ dims: list[list[int]],
+ dtypes: list[str],
+ ) -> None:
+ """Initialize the TransformInputResult.
+
+ :param result: List of Dragon MemoryAlloc objects on which
+ the tensors are stored
+ :param slices: The slices that represent which portion of the
+ input tensors belongs to which request
+ :param dims: Dimension of the transformed tensors
+ :param dtypes: Data type of transformed tensors
+ """
+ self.transformed = result
+ """List of Dragon MemoryAlloc objects on which the tensors are stored"""
+ self.slices = slices
+ """Each slice represents which portion of the input tensors belongs to
+ which request"""
+ self.dims = dims
+ """Dimension of the transformed tensors"""
+ self.dtypes = dtypes
+ """Data type of transformed tensors"""
+
+
+class ExecuteResult:
+ """A wrapper around inference results."""
+
+ def __init__(self, result: t.Any, slices: list[slice]) -> None:
+ """Initialize the ExecuteResult.
+
+ :param result: Result of the execution
+ :param slices: The slices that represent which portion of the input
+ tensors belongs to which request
+ """
+ self.predictions = result
+ """Result of the execution"""
+ self.slices = slices
+ """The slices that represent which portion of the input
+ tensors belongs to which request"""
+
+
+class FetchInputResult:
+ """A wrapper around fetched inputs."""
+
+ def __init__(self, result: t.List[bytes], meta: t.Optional[t.List[t.Any]]) -> None:
+ """Initialize the FetchInputResult.
+
+ :param result: List of input tensor bytes
+ :param meta: List of metadata that corresponds with the inputs
+ """
+ self.inputs = result
+ """List of input tensor bytes"""
+ self.meta = meta
+ """List of metadata that corresponds with the inputs"""
+
+
+class TransformOutputResult:
+ """A wrapper around inference results transformed for transmission."""
+
+ def __init__(
+ self, result: t.Any, shape: t.Optional[t.List[int]], order: str, dtype: str
+ ) -> None:
+ """Initialize the TransformOutputResult.
+
+ :param result: Transformed output results
+ :param shape: Shape of output results
+ :param order: Order of output results
+ :param dtype: Datatype of output results
+ """
+ self.outputs = result
+ """Transformed output results"""
+ self.shape = shape
+ """Shape of output results"""
+ self.order = order
+ """Order of output results"""
+ self.dtype = dtype
+ """Datatype of output results"""
+
+
+class CreateInputBatchResult:
+ """A wrapper around inputs batched into a single request."""
+
+ def __init__(self, result: t.Any) -> None:
+ """Initialize the CreateInputBatchResult.
+
+ :param result: Inputs batched into a single request
+ """
+ self.batch = result
+ """Inputs batched into a single request"""
+
+
+class FetchModelResult:
+ """A wrapper around raw fetched models."""
+
+ def __init__(self, result: bytes) -> None:
+ """Initialize the FetchModelResult.
+
+ :param result: The raw fetched model
+ """
+ self.model_bytes: bytes = result
+ """The raw fetched model"""
+
+
+@dataclass
+class RequestBatch:
+ """A batch of aggregated inference requests."""
+
+ requests: list[InferenceRequest]
+ """List of InferenceRequests in the batch"""
+ inputs: t.Optional[TransformInputResult]
+ """Transformed batch of input tensors"""
+ model_id: "ModelIdentifier"
+ """Model (key, descriptor) tuple"""
+
+ @property
+ def has_valid_requests(self) -> bool:
+ """Returns whether the batch contains at least one request.
+
+ :returns: True if at least one request is available
+ """
+ return len(self.requests) > 0
+
+ @property
+ def has_raw_model(self) -> bool:
+ """Returns whether the batch has a raw model.
+
+ :returns: True if the batch has a raw model
+ """
+ return self.raw_model is not None
+
+ @property
+ def raw_model(self) -> t.Optional[t.Any]:
+ """Returns the raw model to use to execute for this batch
+ if it is available.
+
+ :returns: A model if available, otherwise None"""
+ if self.has_valid_requests:
+ return self.requests[0].raw_model
+ return None
+
+ @property
+ def input_keys(self) -> t.List[TensorKey]:
+ """All input keys available in this batch's requests.
+
+ :returns: All input keys belonging to requests in this batch"""
+ keys = []
+ for request in self.requests:
+ keys.extend(request.input_keys)
+
+ return keys
+
+ @property
+ def output_keys(self) -> t.List[TensorKey]:
+ """All output keys available in this batch's requests.
+
+ :returns: All output keys belonging to requests in this batch"""
+ keys = []
+ for request in self.requests:
+ keys.extend(request.output_keys)
+
+ return keys
+
+
+class MachineLearningWorkerCore:
+ """Basic functionality of ML worker that is shared across all worker types."""
+
+ @staticmethod
+ def deserialize_message(
+ data_blob: bytes,
+ callback_factory: t.Callable[[str], CommChannelBase],
+ ) -> InferenceRequest:
+ """Deserialize a message from a byte stream into an InferenceRequest.
+
+ :param data_blob: The byte stream to deserialize
+ :param callback_factory: A factory method that can create an instance
+ of the desired concrete comm channel type
+ :returns: The raw input message deserialized into an InferenceRequest
+ """
+ request = MessageHandler.deserialize_request(data_blob)
+ model_key: t.Optional[ModelKey] = None
+ model_bytes: t.Optional[Model] = None
+
+ if request.model.which() == "key":
+ model_key = ModelKey(
+ key=request.model.key.key,
+ descriptor=request.model.key.descriptor,
+ )
+ elif request.model.which() == "data":
+ model_bytes = request.model.data
+
+ callback_key = request.replyChannel.descriptor
+ comm_channel = callback_factory(callback_key)
+ input_keys: t.Optional[t.List[TensorKey]] = None
+ input_bytes: t.Optional[t.List[bytes]] = None
+ output_keys: t.Optional[t.List[TensorKey]] = None
+ input_meta: t.Optional[t.List[TensorDescriptor]] = None
+
+ if request.input.which() == "keys":
+ input_keys = [
+ TensorKey(key=value.key, descriptor=value.descriptor)
+ for value in request.input.keys
+ ]
+ elif request.input.which() == "descriptors":
+ input_meta = request.input.descriptors # type: ignore
+
+ if request.output:
+ output_keys = [
+ TensorKey(key=value.key, descriptor=value.descriptor)
+ for value in request.output
+ ]
+
+ inference_request = InferenceRequest(
+ model_key=model_key,
+ callback=comm_channel,
+ raw_inputs=input_bytes,
+ input_meta=input_meta,
+ input_keys=input_keys,
+ output_keys=output_keys,
+ raw_model=model_bytes,
+ batch_size=0,
+ )
+ return inference_request
+
+ @staticmethod
+ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]:
+ """Assemble the output information based on whether the output
+ information will be in the form of TensorKeys or TensorDescriptors.
+
+ :param reply: The reply that the output belongs to
+ :returns: The list of prepared outputs, depending on the output
+ information needed in the reply
+ """
+ prepared_outputs: t.List[t.Any] = []
+ if reply.has_output_keys:
+ for value in reply.output_keys:
+ if not value:
+ continue
+ msg_key = MessageHandler.build_tensor_key(value.key, value.descriptor)
+ prepared_outputs.append(msg_key)
+ elif reply.has_outputs:
+ for _ in reply.outputs:
+ msg_tensor_desc = MessageHandler.build_tensor_descriptor(
+ "c",
+ "float32",
+ [1],
+ )
+ prepared_outputs.append(msg_tensor_desc)
+ return prepared_outputs
+
+ @staticmethod
+ def fetch_model(
+ batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore]
+ ) -> FetchModelResult:
+ """Given a resource key, retrieve the raw model from a feature store.
+
+ :param batch: The batch of requests that triggered the pipeline
+ :param feature_stores: Available feature stores used for persistence
+ :returns: Raw bytes of the model
+        :raises SmartSimError: If neither a key nor a model is provided, or the
+ model cannot be retrieved from the feature store
+ :raises ValueError: If a feature store is not available and a raw
+ model is not provided
+ """
+ # All requests in the same batch share the model
+ if batch.raw_model:
+ return FetchModelResult(batch.raw_model.data)
+
+ if not feature_stores:
+ raise ValueError("Feature store is required for model retrieval")
+
+ if batch.model_id is None:
+ raise SmartSimError(
+ "Key must be provided to retrieve model from feature store"
+ )
+
+ key, fsd = batch.model_id.key, batch.model_id.descriptor
+
+ try:
+ feature_store = feature_stores[fsd]
+ raw_bytes: bytes = t.cast(bytes, feature_store[key])
+ return FetchModelResult(raw_bytes)
+ except (FileNotFoundError, KeyError) as ex:
+ logger.exception(ex)
+ raise SmartSimError(f"Model could not be retrieved with key {key}") from ex
+
+ @staticmethod
+ def fetch_inputs(
+ batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore]
+ ) -> t.List[FetchInputResult]:
+ """Given a collection of ResourceKeys, identify the physical location
+ and input metadata.
+
+ :param batch: The batch of requests that triggered the pipeline
+ :param feature_stores: Available feature stores used for persistence
+ :returns: The fetched input
+ :raises ValueError: If neither an input key or an input tensor are provided
+ :raises SmartSimError: If a tensor for a given key cannot be retrieved
+ """
+ fetch_results = []
+ for request in batch.requests:
+ if request.raw_inputs:
+ fetch_results.append(
+ FetchInputResult(request.raw_inputs, request.input_meta)
+ )
+ continue
+
+ if not feature_stores:
+ raise ValueError("No input and no feature store provided")
+
+ if request.has_input_keys:
+ data: t.List[bytes] = []
+
+ for fs_key in request.input_keys:
+ try:
+ feature_store = feature_stores[fs_key.descriptor]
+ tensor_bytes = t.cast(bytes, feature_store[fs_key.key])
+ data.append(tensor_bytes)
+ except KeyError as ex:
+ logger.exception(ex)
+ raise SmartSimError(
+ f"Tensor could not be retrieved with key {fs_key.key}"
+ ) from ex
+ fetch_results.append(
+ FetchInputResult(data, meta=None)
+ ) # fixme: need to get both tensor and descriptor
+ continue
+
+ raise ValueError("No input source")
+
+ return fetch_results
+
+ @staticmethod
+ def place_output(
+ request: InferenceRequest,
+ transform_result: TransformOutputResult,
+ feature_stores: t.Dict[str, FeatureStore],
+ ) -> t.Collection[t.Optional[TensorKey]]:
+ """Given a collection of data, make it available as a shared resource in the
+ feature store.
+
+ :param request: The request that triggered the pipeline
+ :param transform_result: Transformed version of the inference result
+ :param feature_stores: Available feature stores used for persistence
+ :returns: A collection of keys that were placed in the feature store
+ :raises ValueError: If a feature store is not provided
+ """
+ if not feature_stores:
+ raise ValueError("Feature store is required for output persistence")
+
+ keys: t.List[t.Optional[TensorKey]] = []
+ # need to decide how to get back to original sub-batch inputs so they can be
+ # accurately placed, datum might need to include this.
+
+ # Consider parallelizing all PUT feature_store operations
+ for fs_key, v in zip(request.output_keys, transform_result.outputs):
+ feature_store = feature_stores[fs_key.descriptor]
+ feature_store[fs_key.key] = v
+ keys.append(fs_key)
+
+ return keys
+
+
+class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC):
+ """Abstract base class providing contract for a machine learning
+ worker implementation."""
+
+ @staticmethod
+ @abstractmethod
+ def load_model(
+ batch: RequestBatch, fetch_result: FetchModelResult, device: str
+ ) -> LoadModelResult:
+ """Given the raw bytes of an ML model that were fetched, ensure
+ it is loaded into device memory.
+
+ :param batch: The batch of requests that triggered the pipeline
+ :param fetch_result: The result of a fetch-model operation; contains
+ the raw bytes of the ML model.
+ :param device: The device on which the model must be placed
+ :returns: LoadModelResult wrapping the model loaded for the request
+ :raises ValueError: If model reference object is not found
+ :raises RuntimeError: If loading and evaluating the model failed
+ """
+
+ @staticmethod
+ @abstractmethod
+ def transform_input(
+ batch: RequestBatch,
+ fetch_results: list[FetchInputResult],
+ mem_pool: MemoryPool,
+ ) -> TransformInputResult:
+ """Given a collection of data, perform a transformation on the data and put
+ the raw tensor data on a MemoryPool allocation.
+
+ :param batch: The batch of requests that triggered the pipeline
+ :param fetch_results: Raw outputs from fetching inputs out of a feature store
+ :param mem_pool: The memory pool used to access batched input tensors
+ :returns: The transformed inputs wrapped in a TransformInputResult
+ :raises ValueError: If tensors cannot be reconstructed
+ :raises IndexError: If index out of range
+ """
+
+ @staticmethod
+ @abstractmethod
+ def execute(
+ batch: RequestBatch,
+ load_result: LoadModelResult,
+ transform_result: TransformInputResult,
+ device: str,
+ ) -> ExecuteResult:
+ """Execute an ML model on inputs transformed for use by the model.
+
+ :param batch: The batch of requests that triggered the pipeline
+ :param load_result: The result of loading the model onto device memory
+ :param transform_result: The result of transforming inputs for model consumption
+ :param device: The device on which the model will be executed
+ :returns: The result of inference wrapped in an ExecuteResult
+ :raises SmartSimError: If model is not loaded
+ :raises IndexError: If memory slicing is out of range
+ :raises ValueError: If tensor creation fails or the model cannot be evaluated
+ """
+
+ @staticmethod
+ @abstractmethod
+ def transform_output(
+ batch: RequestBatch, execute_result: ExecuteResult
+ ) -> t.List[TransformOutputResult]:
+ """Given inference results, perform transformations required to
+ transmit results to the requestor.
+
+ :param batch: The batch of requests that triggered the pipeline
+ :param execute_result: The result of inference wrapped in an ExecuteResult
+ :returns: A list of transformed outputs
+ :raises IndexError: If indexing is out of range
+ :raises ValueError: If transforming output fails
+ """
diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py
new file mode 100644
index 0000000000..e3d46a7ab3
--- /dev/null
+++ b/smartsim/_core/mli/message_handler.py
@@ -0,0 +1,602 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import typing as t
+
+from .mli_schemas.data import data_references_capnp
+from .mli_schemas.model import model_capnp
+from .mli_schemas.request import request_capnp
+from .mli_schemas.request.request_attributes import request_attributes_capnp
+from .mli_schemas.response import response_capnp
+from .mli_schemas.response.response_attributes import response_attributes_capnp
+from .mli_schemas.tensor import tensor_capnp
+
+
+class MessageHandler:
+ """Utility methods for transforming capnproto messages to and from
+ internal representations.
+ """
+
+ @staticmethod
+ def build_tensor_descriptor(
+ order: "tensor_capnp.Order",
+ data_type: "tensor_capnp.NumericalType",
+ dimensions: t.List[int],
+ ) -> tensor_capnp.TensorDescriptor:
+ """
+ Builds a TensorDescriptor message using the provided
+ order, data type, and dimensions.
+
+ :param order: Order of the tensor, such as row-major (c) or column-major (f)
+ :param data_type: Data type of the tensor
+ :param dimensions: Dimensions of the tensor
+ :returns: The TensorDescriptor
+ :raises ValueError: If building fails
+ """
+ try:
+ description = tensor_capnp.TensorDescriptor.new_message()
+ description.order = order
+ description.dataType = data_type
+ description.dimensions = dimensions
+ except Exception as e:
+ raise ValueError("Error building tensor descriptor.") from e
+
+ return description
+
+ @staticmethod
+ def build_output_tensor_descriptor(
+ order: "tensor_capnp.Order",
+ keys: t.List["data_references_capnp.TensorKey"],
+ data_type: "tensor_capnp.ReturnNumericalType",
+ dimensions: t.List[int],
+ ) -> tensor_capnp.OutputDescriptor:
+ """
+ Builds an OutputDescriptor message using the provided
+ order, keys, data type, and dimensions.
+
+ :param order: Order of the tensor, such as row-major (c) or column-major (f)
+ :param keys: List of TensorKeys to apply the transform descriptor to
+ :param data_type: Transform data type of the tensor
+ :param dimensions: Transform dimensions of the tensor
+ :returns: The OutputDescriptor
+ :raises ValueError: If building fails
+ """
+ try:
+ description = tensor_capnp.OutputDescriptor.new_message()
+ description.order = order
+ description.optionalKeys = keys
+ description.optionalDatatype = data_type
+ description.optionalDimension = dimensions
+
+ except Exception as e:
+ raise ValueError("Error building output tensor descriptor.") from e
+
+ return description
+
+ @staticmethod
+ def build_tensor_key(key: str, descriptor: str) -> data_references_capnp.TensorKey:
+ """
+ Builds a new TensorKey message with the provided key and descriptor.
+
+ :param key: String to set the TensorKey
+ :param descriptor: A descriptor identifying the feature store
+ containing the key
+ :returns: The TensorKey
+ :raises ValueError: If building fails
+ """
+ try:
+ tensor_key = data_references_capnp.TensorKey.new_message()
+ tensor_key.key = key
+ tensor_key.descriptor = descriptor
+ except Exception as e:
+ raise ValueError("Error building tensor key.") from e
+ return tensor_key
+
+ @staticmethod
+ def build_model(data: bytes, name: str, version: str) -> model_capnp.Model:
+ """
+ Builds a new Model message with the provided data, name, and version.
+
+ :param data: Model data
+ :param name: Model name
+ :param version: Model version
+ :returns: The Model
+ :raises ValueError: If building fails
+ """
+ try:
+ model = model_capnp.Model.new_message()
+ model.data = data
+ model.name = name
+ model.version = version
+ except Exception as e:
+ raise ValueError("Error building model.") from e
+ return model
+
+ @staticmethod
+ def build_model_key(key: str, descriptor: str) -> data_references_capnp.ModelKey:
+ """
+ Builds a new ModelKey message with the provided key and descriptor.
+
+ :param key: String to set the ModelKey
+ :param descriptor: A descriptor identifying the feature store
+ containing the key
+ :returns: The ModelKey
+ :raises ValueError: If building fails
+ """
+ try:
+ model_key = data_references_capnp.ModelKey.new_message()
+ model_key.key = key
+ model_key.descriptor = descriptor
+ except Exception as e:
+ raise ValueError("Error building tensor key.") from e
+ return model_key
+
+ @staticmethod
+ def build_torch_request_attributes(
+ tensor_type: "request_attributes_capnp.TorchTensorType",
+ ) -> request_attributes_capnp.TorchRequestAttributes:
+ """
+ Builds a new TorchRequestAttributes message with the provided tensor type.
+
+ :param tensor_type: Type of the tensor passed in
+ :returns: The TorchRequestAttributes
+ :raises ValueError: If building fails
+ """
+ try:
+ attributes = request_attributes_capnp.TorchRequestAttributes.new_message()
+ attributes.tensorType = tensor_type
+ except Exception as e:
+ raise ValueError("Error building Torch request attributes.") from e
+ return attributes
+
+ @staticmethod
+ def build_tf_request_attributes(
+ name: str, tensor_type: "request_attributes_capnp.TFTensorType"
+ ) -> request_attributes_capnp.TensorFlowRequestAttributes:
+ """
+ Builds a new TensorFlowRequestAttributes message with
+ the provided name and tensor type.
+
+ :param name: Name of the tensor
+ :param tensor_type: Type of the tensor passed in
+ :returns: The TensorFlowRequestAttributes
+ :raises ValueError: If building fails
+ """
+ try:
+ attributes = (
+ request_attributes_capnp.TensorFlowRequestAttributes.new_message()
+ )
+ attributes.name = name
+ attributes.tensorType = tensor_type
+ except Exception as e:
+ raise ValueError("Error building TensorFlow request attributes.") from e
+ return attributes
+
+ @staticmethod
+ def build_torch_response_attributes() -> (
+ response_attributes_capnp.TorchResponseAttributes
+ ):
+ """
+ Builds a new TorchResponseAttributes message.
+
+ :returns: The TorchResponseAttributes
+ """
+ return response_attributes_capnp.TorchResponseAttributes.new_message()
+
+ @staticmethod
+ def build_tf_response_attributes() -> (
+ response_attributes_capnp.TensorFlowResponseAttributes
+ ):
+ """
+ Builds a new TensorFlowResponseAttributes message.
+
+ :returns: The TensorFlowResponseAttributes
+ """
+ return response_attributes_capnp.TensorFlowResponseAttributes.new_message()
+
+ @staticmethod
+ def _assign_model(
+ request: request_capnp.Request,
+ model: t.Union[data_references_capnp.ModelKey, model_capnp.Model],
+ ) -> None:
+ """
+ Assigns a model to the supplied request.
+
+ :param request: Request being built
+ :param model: Model to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ class_name = model.schema.node.displayName.split(":")[-1] # type: ignore
+ if class_name == "Model":
+ request.model.data = model # type: ignore
+ elif class_name == "ModelKey":
+ request.model.key = model # type: ignore
+ else:
+ raise ValueError("""Invalid custom attribute class name.
+ Expected 'Model' or 'ModelKey'.""")
+ except Exception as e:
+ raise ValueError("Error building model portion of request.") from e
+
+ @staticmethod
+ def _assign_reply_channel(
+ request: request_capnp.Request, reply_channel: str
+ ) -> None:
+ """
+ Assigns a reply channel to the supplied request.
+
+ :param request: Request being built
+ :param reply_channel: Reply channel to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ request.replyChannel.descriptor = reply_channel
+ except Exception as e:
+ raise ValueError("Error building reply channel portion of request.") from e
+
+ @staticmethod
+ def _assign_inputs(
+ request: request_capnp.Request,
+ inputs: t.Union[
+ t.List[data_references_capnp.TensorKey],
+ t.List[tensor_capnp.TensorDescriptor],
+ ],
+ ) -> None:
+ """
+ Assigns inputs to the supplied request.
+
+ :param request: Request being built
+ :param inputs: Inputs to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ if inputs:
+ display_name = inputs[0].schema.node.displayName # type: ignore
+ input_class_name = display_name.split(":")[-1]
+ if input_class_name == "TensorDescriptor":
+ request.input.descriptors = inputs # type: ignore
+ elif input_class_name == "TensorKey":
+ request.input.keys = inputs # type: ignore
+ else:
+ raise ValueError("""Invalid input class name. Expected
+ 'TensorDescriptor' or 'TensorKey'.""")
+ except Exception as e:
+ raise ValueError("Error building inputs portion of request.") from e
+
+ @staticmethod
+ def _assign_outputs(
+ request: request_capnp.Request,
+ outputs: t.List[data_references_capnp.TensorKey],
+ ) -> None:
+ """
+ Assigns outputs to the supplied request.
+
+ :param request: Request being built
+ :param outputs: Outputs to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ request.output = outputs
+
+ except Exception as e:
+ raise ValueError("Error building outputs portion of request.") from e
+
+ @staticmethod
+ def _assign_output_descriptors(
+ request: request_capnp.Request,
+ output_descriptors: t.List[tensor_capnp.OutputDescriptor],
+ ) -> None:
+ """
+ Assigns a list of output tensor descriptors to the supplied request.
+
+ :param request: Request being built
+ :param output_descriptors: Output descriptors to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ request.outputDescriptors = output_descriptors
+ except Exception as e:
+ raise ValueError(
+ "Error building the output descriptors portion of request."
+ ) from e
+
+ @staticmethod
+ def _assign_custom_request_attributes(
+ request: request_capnp.Request,
+ custom_attrs: t.Union[
+ request_attributes_capnp.TorchRequestAttributes,
+ request_attributes_capnp.TensorFlowRequestAttributes,
+ None,
+ ],
+ ) -> None:
+ """
+ Assigns request attributes to the supplied request.
+
+ :param request: Request being built
+ :param custom_attrs: Custom attributes to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ if custom_attrs is None:
+ request.customAttributes.none = custom_attrs
+ else:
+ custom_attribute_class_name = (
+ custom_attrs.schema.node.displayName.split(":")[-1] # type: ignore
+ )
+ if custom_attribute_class_name == "TorchRequestAttributes":
+ request.customAttributes.torch = custom_attrs # type: ignore
+ elif custom_attribute_class_name == "TensorFlowRequestAttributes":
+ request.customAttributes.tf = custom_attrs # type: ignore
+ else:
+ raise ValueError("""Invalid custom attribute class name.
+ Expected 'TensorFlowRequestAttributes' or
+ 'TorchRequestAttributes'.""")
+ except Exception as e:
+ raise ValueError(
+ "Error building custom attributes portion of request."
+ ) from e
+
+ @staticmethod
+ def build_request(
+ reply_channel: str,
+ model: t.Union[data_references_capnp.ModelKey, model_capnp.Model],
+ inputs: t.Union[
+ t.List[data_references_capnp.TensorKey],
+ t.List[tensor_capnp.TensorDescriptor],
+ ],
+ outputs: t.List[data_references_capnp.TensorKey],
+ output_descriptors: t.List[tensor_capnp.OutputDescriptor],
+ custom_attributes: t.Union[
+ request_attributes_capnp.TorchRequestAttributes,
+ request_attributes_capnp.TensorFlowRequestAttributes,
+ None,
+ ],
+ ) -> request_capnp.RequestBuilder:
+ """
+ Builds the request message.
+
+ :param reply_channel: Reply channel to be assigned to request
+ :param model: Model to be assigned to request
+ :param inputs: Inputs to be assigned to request
+ :param outputs: Outputs to be assigned to request
+ :param output_descriptors: Output descriptors to be assigned to request
+ :param custom_attributes: Custom attributes to be assigned to request
+ :returns: The Request
+ """
+ request = request_capnp.Request.new_message()
+ MessageHandler._assign_reply_channel(request, reply_channel)
+ MessageHandler._assign_model(request, model)
+ MessageHandler._assign_inputs(request, inputs)
+ MessageHandler._assign_outputs(request, outputs)
+ MessageHandler._assign_output_descriptors(request, output_descriptors)
+ MessageHandler._assign_custom_request_attributes(request, custom_attributes)
+ return request
+
+ @staticmethod
+ def serialize_request(request: request_capnp.RequestBuilder) -> bytes:
+ """
+ Serializes a built request message.
+
+ :param request: Request to be serialized
+ :returns: Serialized request bytes
+ :raises ValueError: If serialization fails
+ """
+ display_name = request.schema.node.displayName # type: ignore
+ class_name = display_name.split(":")[-1]
+ if class_name != "Request":
+ raise ValueError(
+ "Error serializing the request. Value passed in is not "
+ f"a request: {class_name}"
+ )
+ try:
+ return request.to_bytes()
+ except Exception as e:
+ raise ValueError("Error serializing the request") from e
+
+ @staticmethod
+ def deserialize_request(request_bytes: bytes) -> request_capnp.Request:
+ """
+ Deserializes a serialized request message.
+
+ :param request_bytes: Bytes to be deserialized into a request
+ :returns: Deserialized request
+ :raises ValueError: If deserialization fails
+ """
+ try:
+ bytes_message = request_capnp.Request.from_bytes(
+ request_bytes, traversal_limit_in_words=2**63
+ )
+
+ with bytes_message as message:
+ return message
+ except Exception as e:
+ raise ValueError("Error deserializing the request") from e
+
+ @staticmethod
+ def _assign_status(
+ response: response_capnp.Response, status: "response_capnp.Status"
+ ) -> None:
+ """
+ Assigns a status to the supplied response.
+
+ :param response: Response being built
+ :param status: Status to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ response.status = status
+ except Exception as e:
+ raise ValueError("Error assigning status to response.") from e
+
+ @staticmethod
+ def _assign_message(response: response_capnp.Response, message: str) -> None:
+ """
+ Assigns a message to the supplied response.
+
+ :param response: Response being built
+ :param message: Message to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ response.message = message
+ except Exception as e:
+ raise ValueError("Error assigning message to response.") from e
+
+ @staticmethod
+ def _assign_result(
+ response: response_capnp.Response,
+ result: t.Union[
+ t.List[tensor_capnp.TensorDescriptor],
+ t.List[data_references_capnp.TensorKey],
+ None,
+ ],
+ ) -> None:
+ """
+ Assigns a result to the supplied response.
+
+ :param response: Response being built
+ :param result: Result to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ if result:
+ first_result = result[0]
+ display_name = first_result.schema.node.displayName # type: ignore
+ result_class_name = display_name.split(":")[-1]
+ if result_class_name == "TensorDescriptor":
+ response.result.descriptors = result # type: ignore
+ elif result_class_name == "TensorKey":
+ response.result.keys = result # type: ignore
+ else:
+ raise ValueError("""Invalid custom attribute class name.
+ Expected 'TensorDescriptor' or 'TensorKey'.""")
+ except Exception as e:
+ raise ValueError("Error assigning result to response.") from e
+
+ @staticmethod
+ def _assign_custom_response_attributes(
+ response: response_capnp.Response,
+ custom_attrs: t.Union[
+ response_attributes_capnp.TorchResponseAttributes,
+ response_attributes_capnp.TensorFlowResponseAttributes,
+ None,
+ ],
+ ) -> None:
+ """
+ Assigns custom attributes to the supplied response.
+
+ :param response: Response being built
+ :param custom_attrs: Custom attributes to be assigned
+ :raises ValueError: If building fails
+ """
+ try:
+ if custom_attrs is None:
+ response.customAttributes.none = custom_attrs
+ else:
+ custom_attribute_class_name = (
+ custom_attrs.schema.node.displayName.split(":")[-1] # type: ignore
+ )
+ if custom_attribute_class_name == "TorchResponseAttributes":
+ response.customAttributes.torch = custom_attrs # type: ignore
+ elif custom_attribute_class_name == "TensorFlowResponseAttributes":
+ response.customAttributes.tf = custom_attrs # type: ignore
+ else:
+ raise ValueError("""Invalid custom attribute class name.
+ Expected 'TensorFlowResponseAttributes' or
+ 'TorchResponseAttributes'.""")
+ except Exception as e:
+ raise ValueError("Error assigning custom attributes to response.") from e
+
+ @staticmethod
+ def build_response(
+ status: "response_capnp.Status",
+ message: str,
+ result: t.Union[
+ t.List[tensor_capnp.TensorDescriptor],
+ t.List[data_references_capnp.TensorKey],
+ None,
+ ],
+ custom_attributes: t.Union[
+ response_attributes_capnp.TorchResponseAttributes,
+ response_attributes_capnp.TensorFlowResponseAttributes,
+ None,
+ ],
+ ) -> response_capnp.ResponseBuilder:
+ """
+ Builds the response message.
+
+ :param status: Status to be assigned to response
+ :param message: Message to be assigned to response
+ :param result: Result to be assigned to response
+ :param custom_attributes: Custom attributes to be assigned to response
+ :returns: The Response
+ """
+ response = response_capnp.Response.new_message()
+ MessageHandler._assign_status(response, status)
+ MessageHandler._assign_message(response, message)
+ MessageHandler._assign_result(response, result)
+ MessageHandler._assign_custom_response_attributes(response, custom_attributes)
+ return response
+
+ @staticmethod
+ def serialize_response(response: response_capnp.ResponseBuilder) -> bytes:
+ """
+ Serializes a built response message.
+
+ :param response: Response to be serialized
+ :returns: Serialized response bytes
+ :raises ValueError: If serialization fails
+ """
+ display_name = response.schema.node.displayName # type: ignore
+ class_name = display_name.split(":")[-1]
+ if class_name != "Response":
+ raise ValueError(
+ "Error serializing the response. Value passed in is not "
+ f"a response: {class_name}"
+ )
+ try:
+ return response.to_bytes()
+ except Exception as e:
+ raise ValueError("Error serializing the response") from e
+
+ @staticmethod
+ def deserialize_response(response_bytes: bytes) -> response_capnp.Response:
+ """
+ Deserializes a serialized response message.
+
+ :param response_bytes: Bytes to be deserialized into a response
+ :returns: Deserialized response
+ :raises ValueError: If deserialization fails
+ """
+ try:
+ bytes_message = response_capnp.Response.from_bytes(
+ response_bytes, traversal_limit_in_words=2**63
+ )
+
+ with bytes_message as message:
+ return message
+
+ except Exception as e:
+ raise ValueError("Error deserializing the response") from e
diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp
new file mode 100644
index 0000000000..65293be7b2
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp
@@ -0,0 +1,37 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+@0x8ca69fd1aacb6668;
+
+struct ModelKey {
+ key @0 :Text;
+ descriptor @1 :Text;
+}
+
+struct TensorKey {
+ key @0 :Text;
+ descriptor @1 :Text;
+}
diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py
new file mode 100644
index 0000000000..099d10c438
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py
@@ -0,0 +1,41 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `data_references.capnp`."""
+
+import os
+
+import capnp # type: ignore
+
+capnp.remove_import_hook()
+here = os.path.dirname(os.path.abspath(__file__))
+module_file = os.path.abspath(os.path.join(here, "data_references.capnp"))
+ModelKey = capnp.load(module_file).ModelKey
+ModelKeyBuilder = ModelKey
+ModelKeyReader = ModelKey
+TensorKey = capnp.load(module_file).TensorKey
+TensorKeyBuilder = TensorKey
+TensorKeyReader = TensorKey
diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi
new file mode 100644
index 0000000000..a5e318a556
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi
@@ -0,0 +1,107 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `data_references.capnp`."""
+
+# mypy: ignore-errors
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from io import BufferedWriter
+from typing import Iterator
+
+class ModelKey:
+ key: str
+ descriptor: str
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[ModelKeyReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> ModelKeyReader: ...
+ @staticmethod
+ def new_message() -> ModelKeyBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class ModelKeyReader(ModelKey):
+ def as_builder(self) -> ModelKeyBuilder: ...
+
+class ModelKeyBuilder(ModelKey):
+ @staticmethod
+ def from_dict(dictionary: dict) -> ModelKeyBuilder: ...
+ def copy(self) -> ModelKeyBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> ModelKeyReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+
+class TensorKey:
+ key: str
+ descriptor: str
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[TensorKeyReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> TensorKeyReader: ...
+ @staticmethod
+ def new_message() -> TensorKeyBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class TensorKeyReader(TensorKey):
+ def as_builder(self) -> TensorKeyBuilder: ...
+
+class TensorKeyBuilder(TensorKey):
+ @staticmethod
+ def from_dict(dictionary: dict) -> TensorKeyBuilder: ...
+ def copy(self) -> TensorKeyBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> TensorKeyReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
diff --git a/smartsim/_core/mli/mli_schemas/model/__init__.py b/smartsim/_core/mli/mli_schemas/model/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/smartsim/_core/mli/mli_schemas/model/model.capnp b/smartsim/_core/mli/mli_schemas/model/model.capnp
new file mode 100644
index 0000000000..fc9ed73663
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/model/model.capnp
@@ -0,0 +1,33 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+@0xaefb9301e14ba4bd;
+
+struct Model {
+ data @0 :Data;
+ name @1 :Text;
+ version @2 :Text;
+}
diff --git a/smartsim/_core/mli/mli_schemas/model/model_capnp.py b/smartsim/_core/mli/mli_schemas/model/model_capnp.py
new file mode 100644
index 0000000000..be2c276c23
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/model/model_capnp.py
@@ -0,0 +1,38 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `model.capnp`."""
+
+import os
+
+import capnp # type: ignore
+
+capnp.remove_import_hook()
+here = os.path.dirname(os.path.abspath(__file__))
+module_file = os.path.abspath(os.path.join(here, "model.capnp"))
+Model = capnp.load(module_file).Model
+ModelBuilder = Model
+ModelReader = Model
diff --git a/smartsim/_core/mli/mli_schemas/model/model_capnp.pyi b/smartsim/_core/mli/mli_schemas/model/model_capnp.pyi
new file mode 100644
index 0000000000..6ca53a3579
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/model/model_capnp.pyi
@@ -0,0 +1,72 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `model.capnp`."""
+
+# mypy: ignore-errors
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from io import BufferedWriter
+from typing import Iterator
+
+class Model:
+ data: bytes
+ name: str
+ version: str
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[ModelReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> ModelReader: ...
+ @staticmethod
+ def new_message() -> ModelBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class ModelReader(Model):
+ def as_builder(self) -> ModelBuilder: ...
+
+class ModelBuilder(Model):
+ @staticmethod
+ def from_dict(dictionary: dict) -> ModelBuilder: ...
+ def copy(self) -> ModelBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> ModelReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp
new file mode 100644
index 0000000000..26d9542d9f
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/request/request.capnp
@@ -0,0 +1,55 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+@0xa27f0152c7bb299e;
+
+using Tensors = import "../tensor/tensor.capnp";
+using RequestAttributes = import "request_attributes/request_attributes.capnp";
+using DataRef = import "../data/data_references.capnp";
+using Models = import "../model/model.capnp";
+
+struct ChannelDescriptor {
+ descriptor @0 :Text;
+}
+
+struct Request {
+ replyChannel @0 :ChannelDescriptor;
+ model :union {
+ key @1 :DataRef.ModelKey;
+ data @2 :Models.Model;
+ }
+ input :union {
+ keys @3 :List(DataRef.TensorKey);
+ descriptors @4 :List(Tensors.TensorDescriptor);
+ }
+ output @5 :List(DataRef.TensorKey);
+ outputDescriptors @6 :List(Tensors.OutputDescriptor);
+ customAttributes :union {
+ torch @7 :RequestAttributes.TorchRequestAttributes;
+ tf @8 :RequestAttributes.TensorFlowRequestAttributes;
+ none @9 :Void;
+ }
+}
diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp
new file mode 100644
index 0000000000..f0a319f0a3
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp
@@ -0,0 +1,49 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+@0xdd14d8ba5c06743f;
+
+enum TorchTensorType {
+ nested @0; # ragged
+ sparse @1;
+ tensor @2; # "normal" tensor
+}
+
+enum TFTensorType {
+ ragged @0;
+ sparse @1;
+ variable @2;
+ constant @3;
+}
+
+struct TorchRequestAttributes {
+ tensorType @0 :TorchTensorType;
+}
+
+struct TensorFlowRequestAttributes {
+ name @0 :Text;
+ tensorType @1 :TFTensorType;
+}
diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py
new file mode 100644
index 0000000000..8969f38457
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py
@@ -0,0 +1,41 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `request_attributes.capnp`."""
+
+import os
+
+import capnp # type: ignore
+
+capnp.remove_import_hook()
+here = os.path.dirname(os.path.abspath(__file__))
+module_file = os.path.abspath(os.path.join(here, "request_attributes.capnp"))
+TorchRequestAttributes = capnp.load(module_file).TorchRequestAttributes
+TorchRequestAttributesBuilder = TorchRequestAttributes
+TorchRequestAttributesReader = TorchRequestAttributes
+TensorFlowRequestAttributes = capnp.load(module_file).TensorFlowRequestAttributes
+TensorFlowRequestAttributesBuilder = TensorFlowRequestAttributes
+TensorFlowRequestAttributesReader = TensorFlowRequestAttributes
diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi
new file mode 100644
index 0000000000..c474de4b4f
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi
@@ -0,0 +1,109 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `request_attributes.capnp`."""
+
+# mypy: ignore-errors
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from io import BufferedWriter
+from typing import Iterator, Literal
+
+TorchTensorType = Literal["nested", "sparse", "tensor"]
+TFTensorType = Literal["ragged", "sparse", "variable", "constant"]
+
+class TorchRequestAttributes:
+ tensorType: TorchTensorType
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[TorchRequestAttributesReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> TorchRequestAttributesReader: ...
+ @staticmethod
+ def new_message() -> TorchRequestAttributesBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class TorchRequestAttributesReader(TorchRequestAttributes):
+ def as_builder(self) -> TorchRequestAttributesBuilder: ...
+
+class TorchRequestAttributesBuilder(TorchRequestAttributes):
+ @staticmethod
+ def from_dict(dictionary: dict) -> TorchRequestAttributesBuilder: ...
+ def copy(self) -> TorchRequestAttributesBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> TorchRequestAttributesReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+
+class TensorFlowRequestAttributes:
+ name: str
+ tensorType: TFTensorType
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[TensorFlowRequestAttributesReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> TensorFlowRequestAttributesReader: ...
+ @staticmethod
+ def new_message() -> TensorFlowRequestAttributesBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class TensorFlowRequestAttributesReader(TensorFlowRequestAttributes):
+ def as_builder(self) -> TensorFlowRequestAttributesBuilder: ...
+
+class TensorFlowRequestAttributesBuilder(TensorFlowRequestAttributes):
+ @staticmethod
+ def from_dict(dictionary: dict) -> TensorFlowRequestAttributesBuilder: ...
+ def copy(self) -> TensorFlowRequestAttributesBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> TensorFlowRequestAttributesReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.py b/smartsim/_core/mli/mli_schemas/request/request_capnp.py
new file mode 100644
index 0000000000..90b8ce194e
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.py
@@ -0,0 +1,41 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `request.capnp`."""
+
+import os
+
+import capnp # type: ignore
+
+capnp.remove_import_hook()
+here = os.path.dirname(os.path.abspath(__file__))
+module_file = os.path.abspath(os.path.join(here, "request.capnp"))
+ChannelDescriptor = capnp.load(module_file).ChannelDescriptor
+ChannelDescriptorBuilder = ChannelDescriptor
+ChannelDescriptorReader = ChannelDescriptor
+Request = capnp.load(module_file).Request
+RequestBuilder = Request
+RequestReader = Request
diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi
new file mode 100644
index 0000000000..2aab80b1d0
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi
@@ -0,0 +1,319 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `request.capnp`."""
+
+# mypy: ignore-errors
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from io import BufferedWriter
+from typing import Iterator, Literal, Sequence, overload
+
+from ..data.data_references_capnp import (
+ ModelKey,
+ ModelKeyBuilder,
+ ModelKeyReader,
+ TensorKey,
+ TensorKeyBuilder,
+ TensorKeyReader,
+)
+from ..model.model_capnp import Model, ModelBuilder, ModelReader
+from ..tensor.tensor_capnp import (
+ OutputDescriptor,
+ OutputDescriptorBuilder,
+ OutputDescriptorReader,
+ TensorDescriptor,
+ TensorDescriptorBuilder,
+ TensorDescriptorReader,
+)
+from .request_attributes.request_attributes_capnp import (
+ TensorFlowRequestAttributes,
+ TensorFlowRequestAttributesBuilder,
+ TensorFlowRequestAttributesReader,
+ TorchRequestAttributes,
+ TorchRequestAttributesBuilder,
+ TorchRequestAttributesReader,
+)
+
+class ChannelDescriptor:
+ descriptor: str
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[ChannelDescriptorReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> ChannelDescriptorReader: ...
+ @staticmethod
+ def new_message() -> ChannelDescriptorBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class ChannelDescriptorReader(ChannelDescriptor):
+ def as_builder(self) -> ChannelDescriptorBuilder: ...
+
+class ChannelDescriptorBuilder(ChannelDescriptor):
+ @staticmethod
+ def from_dict(dictionary: dict) -> ChannelDescriptorBuilder: ...
+ def copy(self) -> ChannelDescriptorBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> ChannelDescriptorReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+
+class Request:
+ class Model:
+ key: ModelKey | ModelKeyBuilder | ModelKeyReader
+ data: Model | ModelBuilder | ModelReader
+ def which(self) -> Literal["key", "data"]: ...
+ @overload
+ def init(self, name: Literal["key"]) -> ModelKey: ...
+ @overload
+ def init(self, name: Literal["data"]) -> Model: ...
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[Request.ModelReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Request.ModelReader: ...
+ @staticmethod
+ def new_message() -> Request.ModelBuilder: ...
+ def to_dict(self) -> dict: ...
+
+ class ModelReader(Request.Model):
+ key: ModelKeyReader
+ data: ModelReader
+ def as_builder(self) -> Request.ModelBuilder: ...
+
+ class ModelBuilder(Request.Model):
+ key: ModelKey | ModelKeyBuilder | ModelKeyReader
+ data: Model | ModelBuilder | ModelReader
+ @staticmethod
+ def from_dict(dictionary: dict) -> Request.ModelBuilder: ...
+ def copy(self) -> Request.ModelBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> Request.ModelReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+
+ class Input:
+ keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader]
+ descriptors: Sequence[
+ TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader
+ ]
+ def which(self) -> Literal["keys", "descriptors"]: ...
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[Request.InputReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Request.InputReader: ...
+ @staticmethod
+ def new_message() -> Request.InputBuilder: ...
+ def to_dict(self) -> dict: ...
+
+ class InputReader(Request.Input):
+ keys: Sequence[TensorKeyReader]
+ descriptors: Sequence[TensorDescriptorReader]
+ def as_builder(self) -> Request.InputBuilder: ...
+
+ class InputBuilder(Request.Input):
+ keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader]
+ descriptors: Sequence[
+ TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader
+ ]
+ @staticmethod
+ def from_dict(dictionary: dict) -> Request.InputBuilder: ...
+ def copy(self) -> Request.InputBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> Request.InputReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+
+ class CustomAttributes:
+ torch: (
+ TorchRequestAttributes
+ | TorchRequestAttributesBuilder
+ | TorchRequestAttributesReader
+ )
+ tf: (
+ TensorFlowRequestAttributes
+ | TensorFlowRequestAttributesBuilder
+ | TensorFlowRequestAttributesReader
+ )
+ none: None
+ def which(self) -> Literal["torch", "tf", "none"]: ...
+ @overload
+ def init(self, name: Literal["torch"]) -> TorchRequestAttributes: ...
+ @overload
+ def init(self, name: Literal["tf"]) -> TensorFlowRequestAttributes: ...
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[Request.CustomAttributesReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Request.CustomAttributesReader: ...
+ @staticmethod
+ def new_message() -> Request.CustomAttributesBuilder: ...
+ def to_dict(self) -> dict: ...
+
+ class CustomAttributesReader(Request.CustomAttributes):
+ torch: TorchRequestAttributesReader
+ tf: TensorFlowRequestAttributesReader
+ def as_builder(self) -> Request.CustomAttributesBuilder: ...
+
+ class CustomAttributesBuilder(Request.CustomAttributes):
+ torch: (
+ TorchRequestAttributes
+ | TorchRequestAttributesBuilder
+ | TorchRequestAttributesReader
+ )
+ tf: (
+ TensorFlowRequestAttributes
+ | TensorFlowRequestAttributesBuilder
+ | TensorFlowRequestAttributesReader
+ )
+ @staticmethod
+ def from_dict(dictionary: dict) -> Request.CustomAttributesBuilder: ...
+ def copy(self) -> Request.CustomAttributesBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> Request.CustomAttributesReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+ replyChannel: ChannelDescriptor | ChannelDescriptorBuilder | ChannelDescriptorReader
+ model: Request.Model | Request.ModelBuilder | Request.ModelReader
+ input: Request.Input | Request.InputBuilder | Request.InputReader
+ output: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader]
+ outputDescriptors: Sequence[
+ OutputDescriptor | OutputDescriptorBuilder | OutputDescriptorReader
+ ]
+ customAttributes: (
+ Request.CustomAttributes
+ | Request.CustomAttributesBuilder
+ | Request.CustomAttributesReader
+ )
+ @overload
+ def init(self, name: Literal["replyChannel"]) -> ChannelDescriptor: ...
+ @overload
+ def init(self, name: Literal["model"]) -> Model: ...
+ @overload
+ def init(self, name: Literal["input"]) -> Input: ...
+ @overload
+ def init(self, name: Literal["customAttributes"]) -> CustomAttributes: ...
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[RequestReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> RequestReader: ...
+ @staticmethod
+ def new_message() -> RequestBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class RequestReader(Request):
+ replyChannel: ChannelDescriptorReader
+ model: Request.ModelReader
+ input: Request.InputReader
+ output: Sequence[TensorKeyReader]
+ outputDescriptors: Sequence[OutputDescriptorReader]
+ customAttributes: Request.CustomAttributesReader
+ def as_builder(self) -> RequestBuilder: ...
+
+class RequestBuilder(Request):
+ replyChannel: ChannelDescriptor | ChannelDescriptorBuilder | ChannelDescriptorReader
+ model: Request.Model | Request.ModelBuilder | Request.ModelReader
+ input: Request.Input | Request.InputBuilder | Request.InputReader
+ output: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader]
+ outputDescriptors: Sequence[
+ OutputDescriptor | OutputDescriptorBuilder | OutputDescriptorReader
+ ]
+ customAttributes: (
+ Request.CustomAttributes
+ | Request.CustomAttributesBuilder
+ | Request.CustomAttributesReader
+ )
+ @staticmethod
+ def from_dict(dictionary: dict) -> RequestBuilder: ...
+ def copy(self) -> RequestBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> RequestReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
diff --git a/smartsim/_core/mli/mli_schemas/response/response.capnp b/smartsim/_core/mli/mli_schemas/response/response.capnp
new file mode 100644
index 0000000000..7194524cd0
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/response/response.capnp
@@ -0,0 +1,52 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+@0xa05dcb4444780705;
+
+using Tensors = import "../tensor/tensor.capnp";
+using ResponseAttributes = import "response_attributes/response_attributes.capnp";
+using DataRef = import "../data/data_references.capnp";
+
+enum Status {
+ complete @0;
+ fail @1;
+ timeout @2;
+ running @3;
+}
+
+struct Response {
+ status @0 :Status;
+ message @1 :Text;
+ result :union {
+ keys @2 :List(DataRef.TensorKey);
+ descriptors @3 :List(Tensors.TensorDescriptor);
+ }
+ customAttributes :union {
+ torch @4 :ResponseAttributes.TorchResponseAttributes;
+ tf @5 :ResponseAttributes.TensorFlowResponseAttributes;
+ none @6 :Void;
+ }
+}
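
As a rough illustration only (not part of this diff), the Response schema above can be
exercised through pycapnp once the generated loader added later in this changeset is
importable; field names follow the schema, but the snippet is a sketch, not project code.

    # Sketch: assumes pycapnp is installed and this changeset's response_capnp loader is importable
    from smartsim._core.mli.mli_schemas.response import response_capnp

    response = response_capnp.Response.new_message()
    response.status = "complete"             # enum members are assigned by name
    response.message = "inference finished"
    descriptors = response.result.init("descriptors", 1)  # pick the `descriptors` union branch
    descriptors[0].dimensions = [2, 3]
    descriptors[0].order = "c"
    descriptors[0].dataType = "float32"
    response.customAttributes.none = None    # explicitly select the empty branch
    payload = response.to_bytes()            # serialize for transport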
diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp
new file mode 100644
index 0000000000..b4dcf18e88
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp
@@ -0,0 +1,33 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+@0xee59c60fccbb1bf9;
+
+struct TorchResponseAttributes {
+}
+
+struct TensorFlowResponseAttributes {
+}
diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py
new file mode 100644
index 0000000000..4839334d52
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py
@@ -0,0 +1,41 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `response_attributes.capnp`."""
+
+import os
+
+import capnp # type: ignore
+
+capnp.remove_import_hook()
+here = os.path.dirname(os.path.abspath(__file__))
+module_file = os.path.abspath(os.path.join(here, "response_attributes.capnp"))
+TorchResponseAttributes = capnp.load(module_file).TorchResponseAttributes
+TorchResponseAttributesBuilder = TorchResponseAttributes
+TorchResponseAttributesReader = TorchResponseAttributes
+TensorFlowResponseAttributes = capnp.load(module_file).TensorFlowResponseAttributes
+TensorFlowResponseAttributesBuilder = TensorFlowResponseAttributes
+TensorFlowResponseAttributesReader = TensorFlowResponseAttributes
diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi
new file mode 100644
index 0000000000..f40688d74a
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi
@@ -0,0 +1,103 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `response_attributes.capnp`."""
+
+# mypy: ignore-errors
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from io import BufferedWriter
+from typing import Iterator
+
+class TorchResponseAttributes:
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[TorchResponseAttributesReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> TorchResponseAttributesReader: ...
+ @staticmethod
+ def new_message() -> TorchResponseAttributesBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class TorchResponseAttributesReader(TorchResponseAttributes):
+ def as_builder(self) -> TorchResponseAttributesBuilder: ...
+
+class TorchResponseAttributesBuilder(TorchResponseAttributes):
+ @staticmethod
+ def from_dict(dictionary: dict) -> TorchResponseAttributesBuilder: ...
+ def copy(self) -> TorchResponseAttributesBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> TorchResponseAttributesReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+
+class TensorFlowResponseAttributes:
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[TensorFlowResponseAttributesReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> TensorFlowResponseAttributesReader: ...
+ @staticmethod
+ def new_message() -> TensorFlowResponseAttributesBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class TensorFlowResponseAttributesReader(TensorFlowResponseAttributes):
+ def as_builder(self) -> TensorFlowResponseAttributesBuilder: ...
+
+class TensorFlowResponseAttributesBuilder(TensorFlowResponseAttributes):
+ @staticmethod
+ def from_dict(dictionary: dict) -> TensorFlowResponseAttributesBuilder: ...
+ def copy(self) -> TensorFlowResponseAttributesBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> TensorFlowResponseAttributesReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.py b/smartsim/_core/mli/mli_schemas/response/response_capnp.py
new file mode 100644
index 0000000000..eaa3451045
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.py
@@ -0,0 +1,38 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `response.capnp`."""
+
+import os
+
+import capnp # type: ignore
+
+capnp.remove_import_hook()
+here = os.path.dirname(os.path.abspath(__file__))
+module_file = os.path.abspath(os.path.join(here, "response.capnp"))
+Response = capnp.load(module_file).Response
+ResponseBuilder = Response
+ResponseReader = Response
diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi
new file mode 100644
index 0000000000..6b4c50fd05
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi
@@ -0,0 +1,212 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `response.capnp`."""
+
+# mypy: ignore-errors
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from io import BufferedWriter
+from typing import Iterator, Literal, Sequence, overload
+
+from ..data.data_references_capnp import TensorKey, TensorKeyBuilder, TensorKeyReader
+from ..tensor.tensor_capnp import (
+ TensorDescriptor,
+ TensorDescriptorBuilder,
+ TensorDescriptorReader,
+)
+from .response_attributes.response_attributes_capnp import (
+ TensorFlowResponseAttributes,
+ TensorFlowResponseAttributesBuilder,
+ TensorFlowResponseAttributesReader,
+ TorchResponseAttributes,
+ TorchResponseAttributesBuilder,
+ TorchResponseAttributesReader,
+)
+
+Status = Literal["complete", "fail", "timeout", "running"]
+
+class Response:
+ class Result:
+ keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader]
+ descriptors: Sequence[
+ TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader
+ ]
+ def which(self) -> Literal["keys", "descriptors"]: ...
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[Response.ResultReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Response.ResultReader: ...
+ @staticmethod
+ def new_message() -> Response.ResultBuilder: ...
+ def to_dict(self) -> dict: ...
+
+ class ResultReader(Response.Result):
+ keys: Sequence[TensorKeyReader]
+ descriptors: Sequence[TensorDescriptorReader]
+ def as_builder(self) -> Response.ResultBuilder: ...
+
+ class ResultBuilder(Response.Result):
+ keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader]
+ descriptors: Sequence[
+ TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader
+ ]
+ @staticmethod
+ def from_dict(dictionary: dict) -> Response.ResultBuilder: ...
+ def copy(self) -> Response.ResultBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> Response.ResultReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+
+ class CustomAttributes:
+ torch: (
+ TorchResponseAttributes
+ | TorchResponseAttributesBuilder
+ | TorchResponseAttributesReader
+ )
+ tf: (
+ TensorFlowResponseAttributes
+ | TensorFlowResponseAttributesBuilder
+ | TensorFlowResponseAttributesReader
+ )
+ none: None
+ def which(self) -> Literal["torch", "tf", "none"]: ...
+ @overload
+ def init(self, name: Literal["torch"]) -> TorchResponseAttributes: ...
+ @overload
+ def init(self, name: Literal["tf"]) -> TensorFlowResponseAttributes: ...
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[Response.CustomAttributesReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Response.CustomAttributesReader: ...
+ @staticmethod
+ def new_message() -> Response.CustomAttributesBuilder: ...
+ def to_dict(self) -> dict: ...
+
+ class CustomAttributesReader(Response.CustomAttributes):
+ torch: TorchResponseAttributesReader
+ tf: TensorFlowResponseAttributesReader
+ def as_builder(self) -> Response.CustomAttributesBuilder: ...
+
+ class CustomAttributesBuilder(Response.CustomAttributes):
+ torch: (
+ TorchResponseAttributes
+ | TorchResponseAttributesBuilder
+ | TorchResponseAttributesReader
+ )
+ tf: (
+ TensorFlowResponseAttributes
+ | TensorFlowResponseAttributesBuilder
+ | TensorFlowResponseAttributesReader
+ )
+ @staticmethod
+ def from_dict(dictionary: dict) -> Response.CustomAttributesBuilder: ...
+ def copy(self) -> Response.CustomAttributesBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> Response.CustomAttributesReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+ status: Status
+ message: str
+ result: Response.Result | Response.ResultBuilder | Response.ResultReader
+ customAttributes: (
+ Response.CustomAttributes
+ | Response.CustomAttributesBuilder
+ | Response.CustomAttributesReader
+ )
+ @overload
+ def init(self, name: Literal["result"]) -> Result: ...
+ @overload
+ def init(self, name: Literal["customAttributes"]) -> CustomAttributes: ...
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[ResponseReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> ResponseReader: ...
+ @staticmethod
+ def new_message() -> ResponseBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class ResponseReader(Response):
+ result: Response.ResultReader
+ customAttributes: Response.CustomAttributesReader
+ def as_builder(self) -> ResponseBuilder: ...
+
+class ResponseBuilder(Response):
+ result: Response.Result | Response.ResultBuilder | Response.ResultReader
+ customAttributes: (
+ Response.CustomAttributes
+ | Response.CustomAttributesBuilder
+ | Response.CustomAttributesReader
+ )
+ @staticmethod
+ def from_dict(dictionary: dict) -> ResponseBuilder: ...
+ def copy(self) -> ResponseBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> ResponseReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp
new file mode 100644
index 0000000000..4b2218b166
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp
@@ -0,0 +1,75 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+@0x9a0aeb2e04838fb1;
+
+using DataRef = import "../data/data_references.capnp";
+
+enum Order {
+  c @0; # row major (C-contiguous layout)
+  f @1; # column major (Fortran-contiguous layout)
+}
+
+enum NumericalType {
+ int8 @0;
+ int16 @1;
+ int32 @2;
+ int64 @3;
+ uInt8 @4;
+ uInt16 @5;
+ uInt32 @6;
+ uInt64 @7;
+ float32 @8;
+ float64 @9;
+}
+
+enum ReturnNumericalType {
+ int8 @0;
+ int16 @1;
+ int32 @2;
+ int64 @3;
+ uInt8 @4;
+ uInt16 @5;
+ uInt32 @6;
+ uInt64 @7;
+ float32 @8;
+ float64 @9;
+ none @10;
+ auto @11;
+}
+
+struct TensorDescriptor {
+ dimensions @0 :List(Int32);
+ order @1 :Order;
+ dataType @2 :NumericalType;
+}
+
+struct OutputDescriptor {
+ order @0 :Order;
+ optionalKeys @1 :List(DataRef.TensorKey);
+ optionalDimension @2 :List(Int32);
+ optionalDatatype @3 :ReturnNumericalType;
+}
diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py
new file mode 100644
index 0000000000..8c9d6c9029
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py
@@ -0,0 +1,41 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `tensor.capnp`."""
+
+import os
+
+import capnp # type: ignore
+
+capnp.remove_import_hook()
+here = os.path.dirname(os.path.abspath(__file__))
+module_file = os.path.abspath(os.path.join(here, "tensor.capnp"))
+TensorDescriptor = capnp.load(module_file).TensorDescriptor
+TensorDescriptorBuilder = TensorDescriptor
+TensorDescriptorReader = TensorDescriptor
+OutputDescriptor = capnp.load(module_file).OutputDescriptor
+OutputDescriptorBuilder = OutputDescriptor
+OutputDescriptorReader = OutputDescriptor
diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi
new file mode 100644
index 0000000000..b55f26b452
--- /dev/null
+++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi
@@ -0,0 +1,142 @@
+# BSD 2-Clause License
+
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""This is an automatically generated stub for `tensor.capnp`."""
+
+# mypy: ignore-errors
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from io import BufferedWriter
+from typing import Iterator, Literal, Sequence
+
+from ..data.data_references_capnp import TensorKey, TensorKeyBuilder, TensorKeyReader
+
+Order = Literal["c", "f"]
+NumericalType = Literal[
+ "int8",
+ "int16",
+ "int32",
+ "int64",
+ "uInt8",
+ "uInt16",
+ "uInt32",
+ "uInt64",
+ "float32",
+ "float64",
+]
+ReturnNumericalType = Literal[
+ "int8",
+ "int16",
+ "int32",
+ "int64",
+ "uInt8",
+ "uInt16",
+ "uInt32",
+ "uInt64",
+ "float32",
+ "float64",
+ "none",
+ "auto",
+]
+
+class TensorDescriptor:
+ dimensions: Sequence[int]
+ order: Order
+ dataType: NumericalType
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[TensorDescriptorReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> TensorDescriptorReader: ...
+ @staticmethod
+ def new_message() -> TensorDescriptorBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class TensorDescriptorReader(TensorDescriptor):
+ def as_builder(self) -> TensorDescriptorBuilder: ...
+
+class TensorDescriptorBuilder(TensorDescriptor):
+ @staticmethod
+ def from_dict(dictionary: dict) -> TensorDescriptorBuilder: ...
+ def copy(self) -> TensorDescriptorBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> TensorDescriptorReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
+
+class OutputDescriptor:
+ order: Order
+ optionalKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader]
+ optionalDimension: Sequence[int]
+ optionalDatatype: ReturnNumericalType
+ @staticmethod
+ @contextmanager
+ def from_bytes(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> Iterator[OutputDescriptorReader]: ...
+ @staticmethod
+ def from_bytes_packed(
+ data: bytes,
+ traversal_limit_in_words: int | None = ...,
+ nesting_limit: int | None = ...,
+ ) -> OutputDescriptorReader: ...
+ @staticmethod
+ def new_message() -> OutputDescriptorBuilder: ...
+ def to_dict(self) -> dict: ...
+
+class OutputDescriptorReader(OutputDescriptor):
+ optionalKeys: Sequence[TensorKeyReader]
+ def as_builder(self) -> OutputDescriptorBuilder: ...
+
+class OutputDescriptorBuilder(OutputDescriptor):
+ optionalKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader]
+ @staticmethod
+ def from_dict(dictionary: dict) -> OutputDescriptorBuilder: ...
+ def copy(self) -> OutputDescriptorBuilder: ...
+ def to_bytes(self) -> bytes: ...
+ def to_bytes_packed(self) -> bytes: ...
+ def to_segments(self) -> list[bytes]: ...
+ def as_reader(self) -> OutputDescriptorReader: ...
+ @staticmethod
+ def write(file: BufferedWriter) -> None: ...
+ @staticmethod
+ def write_packed(file: BufferedWriter) -> None: ...
diff --git a/smartsim/_core/schemas/utils.py b/smartsim/_core/schemas/utils.py
index 9cb36bcf57..905fe8955c 100644
--- a/smartsim/_core/schemas/utils.py
+++ b/smartsim/_core/schemas/utils.py
@@ -48,7 +48,7 @@ class _Message(t.Generic[_SchemaT]):
delimiter: str = pydantic.Field(min_length=1, default=_DEFAULT_MSG_DELIM)
def __str__(self) -> str:
- return self.delimiter.join((self.header, self.payload.json()))
+ return self.delimiter.join((self.header, self.payload.model_dump_json()))
@classmethod
def from_str(
@@ -58,7 +58,7 @@ def from_str(
delimiter: str = _DEFAULT_MSG_DELIM,
) -> "_Message[_SchemaT]":
header, payload = str_.split(delimiter, 1)
- return cls(payload_type.parse_raw(payload), header, delimiter)
+ return cls(payload_type.model_validate_json(payload), header, delimiter)
class SchemaRegistry(t.Generic[_SchemaT]):
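
The two replacements above track the pydantic v2 API, where `.json()` becomes
`.model_dump_json()` and `.parse_raw()` becomes `.model_validate_json()`. A minimal
standalone illustration (the `Ping` model is hypothetical, not from this diff):

    import pydantic

    class Ping(pydantic.BaseModel):
        seq: int

    raw = Ping(seq=1).model_dump_json()       # '{"seq":1}'
    restored = Ping.model_validate_json(raw)  # Ping(seq=1)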
diff --git a/smartsim/_core/shell/shell_launcher.py b/smartsim/_core/shell/shell_launcher.py
index 9c05f38f6a..530ac8a641 100644
--- a/smartsim/_core/shell/shell_launcher.py
+++ b/smartsim/_core/shell/shell_launcher.py
@@ -50,6 +50,8 @@
logger = get_logger(__name__)
+# pylint: disable=unspecified-encoding
+
class ShellLauncherCommand(t.NamedTuple):
env: EnvironMappingType
@@ -110,14 +112,13 @@ def impl(
else exe
)
# pylint: disable-next=consider-using-with
- return ShellLauncherCommand( # pylint: disable-next=unspecified-encoding
+ return ShellLauncherCommand(
env, pathlib.Path(path), open(stdout_path), open(stderr_path), command_tuple
)
return impl
-# pylint: disable=no-self-use
class ShellLauncher:
"""A launcher for launching/tracking local shell commands"""
diff --git a/smartsim/_core/types.py b/smartsim/_core/types.py
new file mode 100644
index 0000000000..d3dc029eaa
--- /dev/null
+++ b/smartsim/_core/types.py
@@ -0,0 +1,32 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import enum
+
+
+class Device(enum.Enum):
+ CPU = "cpu"
+ GPU = "gpu"
diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py
index 30256034cb..4159c90424 100644
--- a/smartsim/_core/utils/__init__.py
+++ b/smartsim/_core/utils/__init__.py
@@ -29,5 +29,6 @@
colorize,
delete_elements,
execute_platform_cmd,
+ expand_exe_path,
is_crayex_platform,
)
diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py
index 04c17d04c8..e498c26209 100644
--- a/smartsim/_core/utils/helpers.py
+++ b/smartsim/_core/utils/helpers.py
@@ -32,11 +32,14 @@
import base64
import collections.abc
import functools
+import itertools
import os
import signal
import subprocess
+import sys
import typing as t
import uuid
+import warnings
from datetime import datetime
from shutil import which
@@ -52,6 +55,7 @@
_Ts = TypeVarTuple("_Ts")
+_TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime"]
_T = t.TypeVar("_T")
_HashableT = t.TypeVar("_HashableT", bound=t.Hashable)
_TSignalHandlerFn = t.Callable[[int, t.Optional["FrameType"]], object]
@@ -66,7 +70,6 @@ def unpack(value: _NestedJobSequenceType) -> t.Generator[Job, None, None]:
:param value: Sequence containing elements of type Job or other
sequences that are also of type _NestedJobSequenceType
:return: flattened list of Jobs"""
-
from smartsim.launchable.job import Job # pylint: disable=import-outside-toplevel
for item in value:
@@ -602,3 +605,47 @@ def push_unique(self, fn: _TSignalHandlerFn) -> bool:
if did_push := fn not in self:
self.push(fn)
return did_push
+
+
+def _create_pinning_string(
+ pin_ids: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], cpus: int
+) -> t.Optional[str]:
+ """Create a comma-separated string of CPU ids. By default, ``None``
+ returns 0,1,...,cpus-1; an empty iterable will disable pinning
+ altogether, and an iterable constructs a comma separated string of
+ integers (e.g. ``[0, 2, 5]`` -> ``"0,2,5"``)
+
+    :param pin_ids: CPU ids
+    :param cpus: number of CPUs
+ :raises TypeError: if pin id is not an iterable of ints
+ :returns: a comma separated string of CPU ids
+ """
+
+ try:
+ pin_ids = tuple(pin_ids) if pin_ids is not None else None
+ except TypeError:
+ raise TypeError(
+ "Expected a cpu pinning specification of type iterable of ints or "
+ f"iterables of ints. Instead got type `{type(pin_ids)}`"
+ ) from None
+
+ # Deal with MacOSX limitations first. The "None" (default) disables pinning
+ # and is equivalent to []. The only invalid option is a non-empty pinning
+ if sys.platform == "darwin":
+ if pin_ids:
+ warnings.warn(
+ "CPU pinning is not supported on MacOSX. Ignoring pinning "
+ "specification.",
+ RuntimeWarning,
+ )
+ return None
+
+ # Flatten the iterable into a list and check to make sure that the resulting
+ # elements are all ints
+ if pin_ids is None:
+ return ",".join(_stringify_id(i) for i in range(cpus))
+ if not pin_ids:
+ return None
+ pin_ids = ((x,) if isinstance(x, int) else x for x in pin_ids)
+ to_fmt = itertools.chain.from_iterable(pin_ids)
+ return ",".join(sorted({_stringify_id(x) for x in to_fmt}))
diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py
index 4725c95654..c2f70a25ab 100644
--- a/smartsim/_core/utils/serialize.py
+++ b/smartsim/_core/utils/serialize.py
@@ -33,6 +33,8 @@
import smartsim._core._cli.utils as _utils
import smartsim.log
+from smartsim.settings.batch_settings import BatchSettings
+from smartsim.settings.launch_settings import LaunchSettings
if t.TYPE_CHECKING:
from smartsim._core.control.manifest import LaunchedManifest as _Manifest
@@ -40,8 +42,6 @@
from smartsim.database.feature_store import FeatureStore
from smartsim.entity import Application, FSNode
from smartsim.entity.dbobject import FSModel, FSScript
- from smartsim.settings.batch_settings import BatchSettings
- from smartsim.settings.launch_settings import LaunchSettings
TStepLaunchMetaData = t.Tuple[
@@ -235,7 +235,7 @@ def _dictify_fs(
fs_type = "Unknown"
return {
- "name": feature_store.name,
+ "name": feature_store.fs_identifier,
"type": fs_type,
"interface": feature_store._interfaces, # pylint: disable=protected-access
"shards": [
diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py
new file mode 100644
index 0000000000..f99950739e
--- /dev/null
+++ b/smartsim/_core/utils/timings.py
@@ -0,0 +1,175 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import time
+import typing as t
+from collections import OrderedDict
+
+import numpy as np
+
+from ...log import get_logger
+
+logger = get_logger("PerfTimer")
+
+
+class PerfTimer:
+ def __init__(
+ self,
+ filename: str = "timings",
+ prefix: str = "",
+ timing_on: bool = True,
+ debug: bool = False,
+ ):
+ self._start: t.Optional[float] = None
+ self._interm: t.Optional[float] = None
+ self._timings: OrderedDict[str, list[t.Union[float, int, str]]] = OrderedDict()
+ self._timing_on = timing_on
+ self._filename = filename
+ self._prefix = prefix
+ self._debug = debug
+
+ def _add_label_to_timings(self, label: str) -> None:
+ if label not in self._timings:
+ self._timings[label] = []
+
+ @staticmethod
+ def _format_number(number: t.Union[float, int]) -> str:
+ """Formats the input value with a fixed precision appropriate for logging"""
+ return f"{number:0.4e}"
+
+ def start_timings(
+ self,
+ first_label: t.Optional[str] = None,
+ first_value: t.Optional[t.Union[float, int]] = None,
+ ) -> None:
+ """Start a recording session by recording
+
+ :param first_label: a label for an event that will be manually prepended
+ to the timing information before starting timers
+ :param first_label: a value for an event that will be manually prepended
+ to the timing information before starting timers"""
+ if self._timing_on:
+ if first_label is not None and first_value is not None:
+ mod_label = self._make_label(first_label)
+ value = self._format_number(first_value)
+ self._log(f"Started timing: {first_label}: {value}")
+ self._add_label_to_timings(mod_label)
+ self._timings[mod_label].append(value)
+ self._start = time.perf_counter()
+ self._interm = time.perf_counter()
+
+ def end_timings(self) -> None:
+ """Record a timing event and clear the last checkpoint"""
+ if self._timing_on and self._start is not None:
+ mod_label = self._make_label("total_time")
+ self._add_label_to_timings(mod_label)
+ delta = self._format_number(time.perf_counter() - self._start)
+            self._timings[mod_label].append(delta)
+ self._log(f"Finished timing: {mod_label}: {delta}")
+ self._interm = None
+
+ def _make_label(self, label: str) -> str:
+ """Return a label formatted with the current label prefix
+
+ :param label: the original label
+ :returns: the adjusted label value"""
+ return self._prefix + label
+
+ def _get_delta(self) -> float:
+ """Calculates the offset from the last intermediate checkpoint time
+
+ :returns: the number of seconds elapsed"""
+ if self._interm is None:
+ return 0
+ return time.perf_counter() - self._interm
+
+ def get_last(self, label: str) -> str:
+ """Return the last timing value collected for the given label in
+ the format `{label}: {value}`. If no timing value has been collected
+ with the label, returns `Not measured yet`"""
+ mod_label = self._make_label(label)
+ if mod_label in self._timings:
+ value = self._timings[mod_label][-1]
+ if value:
+ return f"{label}: {value}"
+
+ return "Not measured yet"
+
+ def measure_time(self, label: str) -> None:
+ """Record a new time event if timing is enabled
+
+ :param label: the label to record a timing event for"""
+ if self._timing_on and self._interm is not None:
+ mod_label = self._make_label(label)
+ self._add_label_to_timings(mod_label)
+ delta = self._format_number(self._get_delta())
+ self._timings[mod_label].append(delta)
+ self._log(f"{mod_label}: {delta}")
+ self._interm = time.perf_counter()
+
+ def _log(self, msg: str) -> None:
+ """Conditionally logs a message when the debug flag is enabled
+
+ :param msg: the message to be logged"""
+ if self._debug:
+ logger.info(msg)
+
+ @property
+ def max_length(self) -> int:
+ """Returns the number of records contained in the largest timing set"""
+ if len(self._timings) == 0:
+ return 0
+ return max(len(value) for value in self._timings.values())
+
+ def print_timings(self, to_file: bool = False) -> None:
+ """Print timing information to standard output. If `to_file`
+ is `True`, also write results to a file.
+
+        :param to_file: If `True`, also saves timing information
+            to the file `<prefix><filename>.npy`
+ """
+ print(" ".join(self._timings.keys()))
+ try:
+ value_array = np.array(list(self._timings.values()), dtype=float)
+ except Exception as e:
+ logger.exception(e)
+ return
+ value_array = np.transpose(value_array)
+ if self._debug:
+ for i in range(value_array.shape[0]):
+ print(" ".join(self._format_number(value) for value in value_array[i]))
+ if to_file:
+ np.save(self._prefix + self._filename + ".npy", value_array)
+
+ @property
+ def is_active(self) -> bool:
+ """Return `True` if timer is recording, `False` otherwise"""
+ return self._timing_on
+
+ @is_active.setter
+ def is_active(self, active: bool) -> None:
+ """Set to `True` to record timing information, `False` otherwise"""
+ self._timing_on = active
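
A minimal usage sketch for the `PerfTimer` added above (the prefix and label names are
illustrative only):

    import time

    from smartsim._core.utils.timings import PerfTimer

    timer = PerfTimer(prefix="worker_", timing_on=True, debug=True)
    timer.start_timings("batch_size", 16)  # optionally record a leading label/value pair
    time.sleep(0.01)                       # stand-in for real work
    timer.measure_time("fetch_model")      # delta since the last checkpoint
    time.sleep(0.01)
    timer.measure_time("execute")
    timer.end_timings()                    # records "total_time" and closes the session
    timer.print_timings(to_file=True)      # also writes worker_timings.npy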
diff --git a/smartsim/entity/_mock.py b/smartsim/entity/_mock.py
index 8f1043ed3c..7b9c43c5c8 100644
--- a/smartsim/entity/_mock.py
+++ b/smartsim/entity/_mock.py
@@ -34,6 +34,18 @@
import typing as t
+import pytest
+
+from smartsim._core.mli.infrastructure.control.worker_manager import build_failure_reply
+
+dragon = pytest.importorskip("dragon")
+
+if t.TYPE_CHECKING:
+ from smartsim._core.mli.mli_schemas.response.response_capnp import Status
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
class Mock:
"""Base mock class"""
@@ -44,3 +56,28 @@ def __getattr__(self, _: str) -> Mock:
def __deepcopy__(self, _: dict[t.Any, t.Any]) -> Mock:
return type(self)()
+
+
+@pytest.mark.parametrize(
+ "status, message",
+ [
+ pytest.param("timeout", "Worker timed out", id="timeout"),
+ pytest.param("fail", "Failed while executing", id="fail"),
+ ],
+)
+def test_build_failure_reply(status: "Status", message: str):
+ "Ensures failure replies can be built successfully"
+ response = build_failure_reply(status, message)
+ display_name = response.schema.node.displayName # type: ignore
+ class_name = display_name.split(":")[-1]
+ assert class_name == "Response"
+ assert response.status == status
+ assert response.message == message
+
+
+def test_build_failure_reply_fails():
+ "Ensures ValueError is raised if a Status Enum is not used"
+ with pytest.raises(ValueError) as ex:
+ build_failure_reply("not a status enum", "message")
+
+ assert "Error assigning status to response" in ex.value.args[0]
diff --git a/smartsim/entity/application.py b/smartsim/entity/application.py
index da8ec052cf..3dd37d4afe 100644
--- a/smartsim/entity/application.py
+++ b/smartsim/entity/application.py
@@ -28,14 +28,11 @@
import collections
import copy
-import itertools
-import sys
import textwrap
import typing as t
-import warnings
from .._core.generation.operations.operations import FileSysOperationSet
-from .._core.utils.helpers import _stringify_id, expand_exe_path
+from .._core.utils.helpers import expand_exe_path
from ..log import get_logger
from .entity import SmartSimEntity
@@ -219,11 +216,10 @@ def key_prefixing_enabled(self, value: bool) -> None:
self.key_prefixing_enabled = copy.deepcopy(value)
def as_executable_sequence(self) -> t.Sequence[str]:
- """Converts the executable and its arguments into a sequence
- of program arguments.
+ """Converts the executable and its arguments into a sequence of program
+ arguments.
- :return: a sequence of strings representing the executable and
- its arguments
+ :return: a sequence of strings representing the executable and its arguments
"""
return [self.exe, *self.exe_args]
@@ -251,50 +247,6 @@ def _build_exe_args(exe_args: t.Union[str, t.Sequence[str], None]) -> t.List[str
return list(exe_args)
- @staticmethod
- def _create_pinning_string(
- pin_ids: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], cpus: int
- ) -> t.Optional[str]:
- """Create a comma-separated string of CPU ids. By default, ``None``
- returns 0,1,...,cpus-1; an empty iterable will disable pinning
- altogether, and an iterable constructs a comma separated string of
- integers (e.g. ``[0, 2, 5]`` -> ``"0,2,5"``)
-
- :params pin_ids: CPU ids
- :params cpu: number of CPUs
- :raises TypeError: if pin id is not an iterable of ints
- :returns: a comma separated string of CPU ids
- """
-
- try:
- pin_ids = tuple(pin_ids) if pin_ids is not None else None
- except TypeError:
- raise TypeError(
- "Expected a cpu pinning specification of type iterable of ints or "
- f"iterables of ints. Instead got type `{type(pin_ids)}`"
- ) from None
-
- # Deal with MacOSX limitations first. The "None" (default) disables pinning
- # and is equivalent to []. The only invalid option is a non-empty pinning
- if sys.platform == "darwin":
- if pin_ids:
- warnings.warn(
- "CPU pinning is not supported on MacOSX. Ignoring pinning "
- "specification.",
- RuntimeWarning,
- )
- return None
-
- # Flatten the iterable into a list and check to make sure that the resulting
- # elements are all ints
- if pin_ids is None:
- return ",".join(_stringify_id(i) for i in range(cpus))
- if not pin_ids:
- return None
- pin_ids = ((x,) if isinstance(x, int) else x for x in pin_ids)
- to_fmt = itertools.chain.from_iterable(pin_ids)
- return ",".join(sorted({_stringify_id(x) for x in to_fmt}))
-
def __str__(self) -> str: # pragma: no cover
exe_args_str = "\n".join(self.exe_args)
entities_str = "\n".join(str(entity) for entity in self.incoming_entities)
diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py
index f82aeea183..477564e83d 100644
--- a/smartsim/entity/dbobject.py
+++ b/smartsim/entity/dbobject.py
@@ -27,7 +27,8 @@
import typing as t
from pathlib import Path
-from .._core._install.builder import Device
+from smartsim._core.types import Device
+
from ..error import SSUnsupportedError
__all__ = ["FSObject", "FSModel", "FSScript"]
diff --git a/smartsim/experiment.py b/smartsim/experiment.py
index 4dc99975e9..9e6a657d90 100644
--- a/smartsim/experiment.py
+++ b/smartsim/experiment.py
@@ -157,6 +157,13 @@ def __init__(self, name: str, exp_path: str | None = None):
experiment
"""
+ def _set_dragon_server_path(self) -> None:
+ """Set path for dragon server through environment varialbes"""
+ if not "SMARTSIM_DRAGON_SERVER_PATH" in environ:
+ environ["_SMARTSIM_DRAGON_SERVER_PATH_EXP"] = osp.join(
+ self.exp_path, CONFIG.dragon_default_subdir
+ )
+
def start(self, *jobs: Job | t.Sequence[Job]) -> tuple[LaunchedJobID, ...]:
"""Execute a collection of `Job` instances.
@@ -175,7 +182,7 @@ def start(self, *jobs: Job | t.Sequence[Job]) -> tuple[LaunchedJobID, ...]:
jobs_ = list(_helpers.unpack(jobs))
run_id = datetime.datetime.now().replace(microsecond=0).isoformat()
- root = pathlib.Path(self.exp_path, run_id)
+ root = pathlib.Path(self.exp_path, run_id.replace(":", "."))
return self._dispatch(Generator(root), dispatch.DEFAULT_DISPATCHER, *jobs_)
def _dispatch(
@@ -202,18 +209,18 @@ def execute_dispatch(generator: Generator, job: Job, idx: int) -> LaunchedJobID:
args = job.launch_settings.launch_args
env = job.launch_settings.env_vars
exe = job.entity.as_executable_sequence()
- dispatch_instance = dispatcher.get_dispatch(args)
+ dispatch_item = dispatcher.get_dispatch(args)
try:
# Check to see if one of the existing launchers can be
# configured to handle the launch arguments ...
- launch_config = dispatch_instance.configure_first_compatible_launcher(
+ launch_config = dispatch_item.configure_first_compatible_launcher(
from_available_launchers=self._launch_history.iter_past_launchers(),
with_arguments=args,
)
except errors.LauncherNotFoundError:
# ... otherwise create a new launcher that _can_ handle the
# launch arguments and configure _that_ one
- launch_config = dispatch_instance.create_new_launcher_configuration(
+ launch_config = dispatch_item.create_new_launcher_configuration(
for_experiment=self, with_arguments=args
)
# Generate the job directory and return the generated job path
@@ -483,8 +490,8 @@ def _append_to_fs_identifier_list(self, fs_identifier: str) -> None:
if fs_identifier in self._fs_identifiers:
logger.warning(
f"A feature store with the identifier {fs_identifier} has already "
- "been made. An error will be raised if multiple Feature Stores "
- "are started with the same identifier"
+ "been made. An error will be raised if multiple Feature Stores are "
+ "with the same identifier"
)
# Otherwise, add
self._fs_identifiers.add(fs_identifier)
diff --git a/smartsim/launchable/mpmd_job.py b/smartsim/launchable/mpmd_job.py
index ab2aa2db6b..de9545032b 100644
--- a/smartsim/launchable/mpmd_job.py
+++ b/smartsim/launchable/mpmd_job.py
@@ -60,7 +60,7 @@ def _check_entity(mpmd_pairs: t.List[MPMDPair]) -> None:
ret: SmartSimEntity | None = None
for mpmd_pair in mpmd_pairs:
if flag == 1:
- if isinstance(ret, type(mpmd_pair.entity)):
+ if type(ret) is type(mpmd_pair.entity):
flag = 0
else:
raise SSUnsupportedError(
@@ -108,7 +108,6 @@ def get_launch_steps(self) -> LaunchCommands:
# TODO: return MPMDJobWarehouseRunner.run(self)
raise NotImplementedError
- # pylint: disable=unnecessary-lambda-assignment
def __str__(self) -> str: # pragma: no cover
"""returns A user-readable string of a MPMD Job"""
fmt = lambda mpmd_pair: textwrap.dedent(
diff --git a/smartsim/log.py b/smartsim/log.py
index 3d6c0860ee..c8fed9329f 100644
--- a/smartsim/log.py
+++ b/smartsim/log.py
@@ -252,16 +252,21 @@ def filter(self, record: logging.LogRecord) -> bool:
return record.levelno <= level_no
-def log_to_file(filename: str, log_level: str = "debug") -> None:
+def log_to_file(
+ filename: str, log_level: str = "debug", logger: t.Optional[logging.Logger] = None
+) -> None:
"""Installs a second filestream handler to the root logger,
allowing subsequent logging calls to be sent to filename.
- :param filename: the name of the desired log file.
- :param log_level: as defined in get_logger. Can be specified
+ :param filename: The name of the desired log file.
+ :param log_level: As defined in get_logger. Can be specified
to allow the file to store more or less verbose
logging information.
+ :param logger: If supplied, a logger to add the file stream logging
+ behavior to. By default, the "SmartSim" logger is used.
"""
- logger = logging.getLogger("SmartSim")
+ if logger is None:
+ logger = logging.getLogger("SmartSim")
stream = open( # pylint: disable=consider-using-with
filename, "w+", encoding="utf-8"
)
diff --git a/smartsim/ml/tf/__init__.py b/smartsim/ml/tf/__init__.py
index 46d89d7336..ee791ea985 100644
--- a/smartsim/ml/tf/__init__.py
+++ b/smartsim/ml/tf/__init__.py
@@ -31,23 +31,12 @@
logger = get_logger(__name__)
vers = Versioner()
-TF_VERSION = vers.TENSORFLOW
try:
import tensorflow as tf
except ImportError: # pragma: no cover
raise ModuleNotFoundError(
- f"TensorFlow {TF_VERSION} is not installed. "
- "Please install it to use smartsim.ml.tf"
- ) from None
-
-try:
- installed_tf = Version_(tf.__version__)
- assert installed_tf >= TF_VERSION
-except AssertionError: # pragma: no cover
- raise SmartSimError(
- f"TensorFlow >= {TF_VERSION} is required for smartsim. "
- f"tf, you have {tf.__version__}"
+ f"TensorFlow is not installed. Please install it to use smartsim.ml.tf"
) from None
diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py
index dc66c3b55a..74e39d35b2 100644
--- a/smartsim/ml/tf/utils.py
+++ b/smartsim/ml/tf/utils.py
@@ -29,7 +29,7 @@
import keras
import tensorflow as tf
-from tensorflow.python.framework.convert_to_constants import (
+from tensorflow.python.framework.convert_to_constants import ( # type: ignore[import-not-found,unused-ignore]
convert_variables_to_constants_v2,
)
@@ -58,7 +58,7 @@ def freeze_model(
tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype)
)
- frozen_func = convert_variables_to_constants_v2(full_model)
+ frozen_func = convert_variables_to_constants_v2(full_model) # type: ignore[no-untyped-call,unused-ignore]
frozen_func.graph.as_graph_def()
input_names = [x.name.split(":")[0] for x in frozen_func.inputs]
@@ -89,7 +89,7 @@ def serialize_model(model: keras.Model) -> t.Tuple[str, t.List[str], t.List[str]
tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype)
)
- frozen_func = convert_variables_to_constants_v2(full_model)
+ frozen_func = convert_variables_to_constants_v2(full_model) # type: ignore[no-untyped-call,unused-ignore]
frozen_func.graph.as_graph_def()
input_names = [x.name.split(":")[0] for x in frozen_func.inputs]
diff --git a/smartsim/settings/arguments/launch/dragon.py b/smartsim/settings/arguments/launch/dragon.py
index 17874455ca..e58cf69b1b 100644
--- a/smartsim/settings/arguments/launch/dragon.py
+++ b/smartsim/settings/arguments/launch/dragon.py
@@ -86,6 +86,24 @@ def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None:
raise TypeError("feature_list must be string or list of strings")
self.set("node-feature", ",".join(feature_list))
+ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None:
+ """Specify the hostlist for this job
+
+ :param host_list: hosts to launch on
+ :raises ValueError: if an empty host list is supplied
+ """
+ if not host_list:
+ raise ValueError("empty hostlist provided")
+
+ if isinstance(host_list, str):
+ host_list = host_list.replace(" ", "").split(",")
+
+ # strip out all whitespace-only values
+ cleaned_list = [host.strip() for host in host_list if host and host.strip()]
+ if len(cleaned_list) != len(host_list):
+ raise ValueError(f"invalid names found in hostlist: {host_list}")
+ self.set("host-list", ",".join(cleaned_list))
+
def set_cpu_affinity(self, devices: t.List[int]) -> None:
"""Set the CPU affinity for this job
diff --git a/smartsim/settings/batch_settings.py b/smartsim/settings/batch_settings.py
index 7489fa8edd..61b69ca8e7 100644
--- a/smartsim/settings/batch_settings.py
+++ b/smartsim/settings/batch_settings.py
@@ -65,7 +65,8 @@ class BatchSettings(BaseSettings):
def __init__(
self,
batch_scheduler: t.Union[BatchSchedulerType, str],
- batch_args: StringArgument = None,
+ batch_args: StringArgument | None = None,
env_vars: StringArgument | None = None,
) -> None:
"""Initialize a BatchSettings instance.
@@ -82,9 +83,9 @@ def __init__(
# OR
sbatch_settings = BatchSettings(batch_scheduler=BatchSchedulerType.Slurm)
- This will assign a SlurmBatchArguments object to ``sbatch_settings.batch_args``.
- Using the object, users may access the child class functions to set
- batch configurations. For example:
+ This will assign a SlurmBatchArguments object to
+ ``sbatch_settings.batch_args``. Using the object, users may access the child
+ class functions to set batch configurations. For example:
.. highlight:: python
.. code-block:: python
@@ -105,9 +106,9 @@ def __init__(
:param batch_scheduler: The type of scheduler to initialize
(e.g., Slurm, PBS, LSF)
- :param batch_args: A dictionary of arguments for the scheduler, where the keys
- are strings and the values can be either strings or None. This argument is
- optional and defaults to None.
+ :param batch_args: A dictionary of arguments for the scheduler, where
+ the keys are strings and the values can be either strings or None.
+ This argument is optional and defaults to None.
:param env_vars: Environment variables for the batch settings, where the keys
are strings and the values can be either strings or None. This argument is
also optional and defaults to None.
@@ -122,7 +123,6 @@ def __init__(
"""The BatchSettings child class based on scheduler type"""
self.env_vars = env_vars or {}
"""The environment configuration"""
- self.batch_args = batch_args or {}
@property
def batch_scheduler(self) -> str:
diff --git a/smartsim/settings/common.py b/smartsim/settings/common.py
index 1d58da90b3..df7eb243aa 100644
--- a/smartsim/settings/common.py
+++ b/smartsim/settings/common.py
@@ -44,7 +44,7 @@ def set_check_input(key: str, value: t.Optional[str]) -> None:
if key.startswith("-"):
key = key.lstrip("-")
logger.warning(
- "One or more leading `-` characters were provided to \
- the run argument. Leading dashes were stripped and \
- the arguments were passed to the run_command."
+ "One or more leading `-` characters were provided to the run argument.\n"
+ "Leading dashes were stripped and the arguments were passed to the \n"
+ "run_command."
)
diff --git a/smartsim/settings/launch_settings.py b/smartsim/settings/launch_settings.py
index 3f878f59dd..136de7638b 100644
--- a/smartsim/settings/launch_settings.py
+++ b/smartsim/settings/launch_settings.py
@@ -114,8 +114,8 @@ def __init__(
:param launcher: The type of launcher to initialize (e.g., Dragon, Slurm,
PALS, ALPS, Local, Mpiexec, Mpirun, Orterun, LSF)
:param launch_args: A dictionary of arguments for the launcher, where the keys
- are strings and the values can be either strings or None.
- This argument is optional and defaults to None.
+ are strings and the values can be either strings or None. This argument is
+ optional and defaults to None.
:param env_vars: Environment variables for the launch settings, where the keys
are strings and the values can be either strings or None. This argument is
also optional and defaults to None.
diff --git a/smartsim/settings/sge_settings.py b/smartsim/settings/sge_settings.py
index 72dbbf5ce2..757d167c64 100644
--- a/smartsim/settings/sge_settings.py
+++ b/smartsim/settings/sge_settings.py
@@ -36,7 +36,7 @@
# ***************************************
# TODO: Remove pylint disable after merge
# ***************************************
-# pylint: disable=no-self-use
+# pylint: disable=no-self-use,no-member
class SgeQsubBatchSettings(BatchSettings):
diff --git a/tests/_legacy/backends/run_torch.py b/tests/_legacy/backends/run_torch.py
index 83c8a9a8e7..1071e740ef 100644
--- a/tests/_legacy/backends/run_torch.py
+++ b/tests/_legacy/backends/run_torch.py
@@ -25,6 +25,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import io
+import typing as t
import numpy as np
import torch
@@ -74,7 +75,7 @@ def calc_svd(input_tensor):
return input_tensor.svd()
-def run(device):
+def run(device: str, num_devices: int) -> t.Any:
# connect a client to the feature store
client = Client(cluster=False)
@@ -92,9 +93,23 @@ def run(device):
net = create_torch_model()
# 20 samples of "image" data
example_forward_input = torch.rand(20, 1, 28, 28)
- client.set_model("cnn", net, "TORCH", device=device)
client.put_tensor("input", example_forward_input.numpy())
- client.run_model("cnn", inputs=["input"], outputs=["output"])
+ if device == "CPU":
+ client.set_model("cnn", net, "TORCH", device=device)
+ client.run_model("cnn", inputs=["input"], outputs=["output"])
+ else:
+ client.set_model_multigpu(
+ "cnn", net, "TORCH", first_gpu=0, num_gpus=num_devices
+ )
+ client.run_model_multigpu(
+ "cnn",
+ offset=1,
+ first_gpu=0,
+ num_gpus=num_devices,
+ inputs=["input"],
+ outputs=["output"],
+ )
+
output = client.get_tensor("output")
print(f"Prediction: {output}")
@@ -106,5 +121,11 @@ def run(device):
parser.add_argument(
"--device", type=str, default="CPU", help="device type for model execution"
)
+ parser.add_argument(
+ "--num-devices",
+ type=int,
+ default=1,
+ help="Number of devices to set the model on",
+ )
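+ # e.g. (illustrative): python run_torch.py --device=GPU --num-devices=2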
args = parser.parse_args()
- run(args.device)
+ run(args.device, args.num_devices)
diff --git a/tests/_legacy/backends/test_cli_mini_exp.py b/tests/_legacy/backends/test_cli_mini_exp.py
index 1fd1107215..83ecfc5b07 100644
--- a/tests/_legacy/backends/test_cli_mini_exp.py
+++ b/tests/_legacy/backends/test_cli_mini_exp.py
@@ -32,7 +32,7 @@
import smartsim._core._cli.validate
import smartsim._core._install.builder as build
-from smartsim._core.utils.helpers import installed_redisai_backends
+from smartsim._core._install.platform import Device
sklearn_available = True
try:
@@ -70,7 +70,7 @@ def _mock_make_managed_local_feature_store(*a, **kw):
"_make_managed_local_feature_store",
_mock_make_managed_local_feature_store,
)
- backends = installed_redisai_backends()
+ backends = [] # todo: update test to replace installed_redisai_backends()
(fs_port,) = fs.ports
smartsim._core._cli.validate.test_install(
@@ -79,7 +79,7 @@ def _mock_make_managed_local_feature_store(*a, **kw):
location=test_dir,
port=fs_port,
# Always test on CPU, heads don't always have GPU
- device=build.Device.CPU,
+ device=Device.CPU,
# Test the backends the dev has installed
with_tf="tensorflow" in backends,
with_pt="torch" in backends,
diff --git a/tests/_legacy/backends/test_dbmodel.py b/tests/_legacy/backends/test_dbmodel.py
index 5c9a253c75..da495004fa 100644
--- a/tests/_legacy/backends/test_dbmodel.py
+++ b/tests/_legacy/backends/test_dbmodel.py
@@ -30,7 +30,6 @@
import pytest
from smartsim import Experiment
-from smartsim._core.utils import installed_redisai_backends
from smartsim.entity import Ensemble
from smartsim.entity.dbobject import FSModel
from smartsim.error.errors import SSUnsupportedError
@@ -70,7 +69,9 @@ def call(self, x):
except:
logger.warning("Could not set TF max memory limit for GPU")
-should_run_tf &= "tensorflow" in installed_redisai_backends()
+should_run_tf &= (
+ "tensorflow" in []
+) # todo: update test to replace installed_redisai_backends()
# Check if PyTorch is available for tests
try:
@@ -107,7 +108,9 @@ def forward(self, x):
return output
-should_run_pt &= "torch" in installed_redisai_backends()
+should_run_pt &= (
+ "torch" in []
+) # todo: update test to replace installed_redisai_backends()
def save_tf_cnn(path, file_name):
diff --git a/tests/_legacy/backends/test_dbscript.py b/tests/_legacy/backends/test_dbscript.py
index 9619b0325f..ec6e2f861c 100644
--- a/tests/_legacy/backends/test_dbscript.py
+++ b/tests/_legacy/backends/test_dbscript.py
@@ -24,18 +24,15 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import os
import sys
import pytest
from smartredis import *
from smartsim import Experiment
-from smartsim._core.utils import installed_redisai_backends
from smartsim.entity.dbobject import FSScript
from smartsim.error.errors import SSUnsupportedError
from smartsim.log import get_logger
-from smartsim.settings import MpiexecSettings, MpirunSettings
from smartsim.status import JobStatus
logger = get_logger(__name__)
@@ -49,7 +46,7 @@
except ImportError:
should_run = False
-should_run &= "torch" in installed_redisai_backends()
+should_run &= "torch" in [] # todo: update test to replace installed_redisai_backends()
def timestwo(x):
diff --git a/tests/_legacy/backends/test_onnx.py b/tests/_legacy/backends/test_onnx.py
index 3580ec07e3..67c9775aa3 100644
--- a/tests/_legacy/backends/test_onnx.py
+++ b/tests/_legacy/backends/test_onnx.py
@@ -30,8 +30,6 @@
import pytest
-from smartsim import Experiment
-from smartsim._core.utils import installed_redisai_backends
from smartsim.status import JobStatus
sklearn_available = True
@@ -47,7 +45,9 @@
sklearn_available = False
-onnx_backend_available = "onnxruntime" in installed_redisai_backends()
+onnx_backend_available = (
+ "onnxruntime" in []
+) # todo: update test to replace installed_redisai_backends()
should_run = sklearn_available and onnx_backend_available
diff --git a/tests/_legacy/backends/test_tf.py b/tests/_legacy/backends/test_tf.py
index 320fe84721..526c08e29e 100644
--- a/tests/_legacy/backends/test_tf.py
+++ b/tests/_legacy/backends/test_tf.py
@@ -29,8 +29,6 @@
import pytest
-from smartsim import Experiment
-from smartsim._core.utils import installed_redisai_backends
from smartsim.error import SmartSimError
from smartsim.status import JobStatus
@@ -43,7 +41,9 @@
print(e)
tf_available = False
-tf_backend_available = "tensorflow" in installed_redisai_backends()
+tf_backend_available = (
+ "tensorflow" in []
+) # todo: update test to replace installed_redisai_backends()
@pytest.mark.skipif(
diff --git a/tests/_legacy/backends/test_torch.py b/tests/_legacy/backends/test_torch.py
index 2bf6c741a4..2606d08837 100644
--- a/tests/_legacy/backends/test_torch.py
+++ b/tests/_legacy/backends/test_torch.py
@@ -29,8 +29,6 @@
import pytest
-from smartsim import Experiment
-from smartsim._core.utils import installed_redisai_backends
from smartsim.status import JobStatus
torch_available = True
@@ -40,7 +38,9 @@
except ImportError:
torch_available = False
-torch_backend_available = "torch" in installed_redisai_backends()
+torch_backend_available = (
+ "torch" in []
+) # todo: update test to replace installed_redisai_backends()
should_run = torch_available and torch_backend_available
pytestmark = pytest.mark.skipif(
@@ -65,9 +65,11 @@ def test_torch_model_and_script(
fs = prepare_fs(single_fs).featurestore
wlm_experiment.reconnect_feature_store(fs.checkpoint_file)
test_device = mlutils.get_test_device()
+ test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1
run_settings = wlm_experiment.create_run_settings(
- "python", f"run_torch.py --device={test_device}"
+ "python",
+ ["run_torch.py", f"--device={test_device}", f"--num-devices={test_num_gpus}"],
)
if wlmutils.get_test_launcher() != "local":
run_settings.set_tasks(1)
diff --git a/tests/_legacy/install/test_build.py b/tests/_legacy/install/test_build.py
new file mode 100644
index 0000000000..f8a5c4896b
--- /dev/null
+++ b/tests/_legacy/install/test_build.py
@@ -0,0 +1,148 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import operator
+
+import pytest
+
+from smartsim._core._cli.build import parse_requirement
+from smartsim._core._install.buildenv import Version_
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+
+_SUPPORTED_OPERATORS = ("==", ">=", ">", "<=", "<")
+
+
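+# For orientation (illustrative): parse_requirement splits a pip-style specifier
+# into (name, pin, comparison_fn), e.g.
+#   parse_requirement("foo>=1.2.3+cuda") -> ("foo", ">=1.2.3+cuda", <callable>)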
+@pytest.mark.parametrize(
+ "spec, name, pin",
+ (
+ pytest.param("foo", "foo", None, id="Just Name"),
+ pytest.param("foo==1", "foo", "==1", id="With Major"),
+ pytest.param("foo==1.2", "foo", "==1.2", id="With Minor"),
+ pytest.param("foo==1.2.3", "foo", "==1.2.3", id="With Patch"),
+ pytest.param("foo[with-extras]==1.2.3", "foo", "==1.2.3", id="With Extra"),
+ pytest.param(
+ "foo[with,many,extras]==1.2.3", "foo", "==1.2.3", id="With Many Extras"
+ ),
+ *(
+ pytest.param(
+ f"foo{symbol}1.2.3{tag}",
+ "foo",
+ f"{symbol}1.2.3{tag}",
+ id=f"{symbol=} | {tag=}",
+ )
+ for symbol in _SUPPORTED_OPERATORS
+ for tag in ("", "+cuda", "+rocm", "+cpu")
+ ),
+ ),
+)
+def test_parse_requirement_name_and_version(spec, name, pin):
+ p_name, p_pin, _ = parse_requirement(spec)
+ assert p_name == name
+ assert p_pin == pin
+
+
+# fmt: off
+@pytest.mark.parametrize(
+ "spec, ver, should_pass",
+ (
+ pytest.param("foo" , Version_("1.2.3") , True, id="No spec"),
+ # EQ --------------------------------------------------------------------------
+ pytest.param("foo==1.2.3" , Version_("1.2.3") , True, id="EQ Spec, EQ Version"),
+ pytest.param("foo==1.2.3" , Version_("1.2.5") , False, id="EQ Spec, GT Version"),
+ pytest.param("foo==1.2.3" , Version_("1.2.2") , False, id="EQ Spec, LT Version"),
+ pytest.param("foo==1.2.3+rocm", Version_("1.2.3+rocm"), True, id="EQ Spec, Compatible Version with suffix"),
+ pytest.param("foo==1.2.3" , Version_("1.2.3+cuda"), False, id="EQ Spec, Compatible Version, Extra Suffix"),
+ pytest.param("foo==1.2.3+cuda", Version_("1.2.3") , False, id="EQ Spec, Compatible Version, Missing Suffix"),
+ pytest.param("foo==1.2.3+cuda", Version_("1.2.3+rocm"), False, id="EQ Spec, Compatible Version, Mismatched Suffix"),
+ # LT --------------------------------------------------------------------------
+ pytest.param("foo<1.2.3" , Version_("1.2.3") , False, id="LT Spec, EQ Version"),
+ pytest.param("foo<1.2.3" , Version_("1.2.5") , False, id="LT Spec, GT Version"),
+ pytest.param("foo<1.2.3" , Version_("1.2.2") , True, id="LT Spec, LT Version"),
+ pytest.param("foo<1.2.3+rocm" , Version_("1.2.2+rocm"), True, id="LT Spec, Compatible Version with suffix"),
+ pytest.param("foo<1.2.3" , Version_("1.2.2+cuda"), False, id="LT Spec, Compatible Version, Extra Suffix"),
+ pytest.param("foo<1.2.3+cuda" , Version_("1.2.2") , False, id="LT Spec, Compatible Version, Missing Suffix"),
+ pytest.param("foo<1.2.3+cuda" , Version_("1.2.2+rocm"), False, id="LT Spec, Compatible Version, Mismatched Suffix"),
+ # LE --------------------------------------------------------------------------
+ pytest.param("foo<=1.2.3" , Version_("1.2.3") , True, id="LE Spec, EQ Version"),
+ pytest.param("foo<=1.2.3" , Version_("1.2.5") , False, id="LE Spec, GT Version"),
+ pytest.param("foo<=1.2.3" , Version_("1.2.2") , True, id="LE Spec, LT Version"),
+ pytest.param("foo<=1.2.3+rocm", Version_("1.2.3+rocm"), True, id="LE Spec, Compatible Version with suffix"),
+ pytest.param("foo<=1.2.3" , Version_("1.2.3+cuda"), False, id="LE Spec, Compatible Version, Extra Suffix"),
+ pytest.param("foo<=1.2.3+cuda", Version_("1.2.3") , False, id="LE Spec, Compatible Version, Missing Suffix"),
+ pytest.param("foo<=1.2.3+cuda", Version_("1.2.3+rocm"), False, id="LE Spec, Compatible Version, Mismatched Suffix"),
+ # GT --------------------------------------------------------------------------
+ pytest.param("foo>1.2.3" , Version_("1.2.3") , False, id="GT Spec, EQ Version"),
+ pytest.param("foo>1.2.3" , Version_("1.2.5") , True, id="GT Spec, GT Version"),
+ pytest.param("foo>1.2.3" , Version_("1.2.2") , False, id="GT Spec, LT Version"),
+ pytest.param("foo>1.2.3+rocm" , Version_("1.2.4+rocm"), True, id="GT Spec, Compatible Version with suffix"),
+ pytest.param("foo>1.2.3" , Version_("1.2.4+cuda"), False, id="GT Spec, Compatible Version, Extra Suffix"),
+ pytest.param("foo>1.2.3+cuda" , Version_("1.2.4") , False, id="GT Spec, Compatible Version, Missing Suffix"),
+ pytest.param("foo>1.2.3+cuda" , Version_("1.2.4+rocm"), False, id="GT Spec, Compatible Version, Mismatched Suffix"),
+ # GE --------------------------------------------------------------------------
+ pytest.param("foo>=1.2.3" , Version_("1.2.3") , True, id="GE Spec, EQ Version"),
+ pytest.param("foo>=1.2.3" , Version_("1.2.5") , True, id="GE Spec, GT Version"),
+ pytest.param("foo>=1.2.3" , Version_("1.2.2") , False, id="GE Spec, LT Version"),
+ pytest.param("foo>=1.2.3+rocm", Version_("1.2.3+rocm"), True, id="GE Spec, Compatible Version with suffix"),
+ pytest.param("foo>=1.2.3" , Version_("1.2.3+cuda"), False, id="GE Spec, Compatible Version, Extra Suffix"),
+ pytest.param("foo>=1.2.3+cuda", Version_("1.2.3") , False, id="GE Spec, Compatible Version, Missing Suffix"),
+ pytest.param("foo>=1.2.3+cuda", Version_("1.2.3+rocm"), False, id="GE Spec, Compatible Version, Mismatched Suffix"),
+ )
+)
+# fmt: on
+def test_parse_requirement_comparison_fn(spec, ver, should_pass):
+ _, _, cmp = parse_requirement(spec)
+ assert cmp(ver) == should_pass
+
+
+@pytest.mark.parametrize(
+ "spec, ctx",
+ (
+ *(
+ pytest.param(
+ f"thing{symbol}",
+ pytest.raises(ValueError, match="Invalid requirement string:"),
+ id=f"No version w/ operator {symbol}",
+ )
+ for symbol in _SUPPORTED_OPERATORS
+ ),
+ pytest.param(
+ "thing>=>1.2.3",
+ pytest.raises(ValueError, match="Invalid requirement string:"),
+ id="Operator too long",
+ ),
+ pytest.param(
+ "thing<>1.2.3",
+ pytest.raises(ValueError, match="Unrecognized comparison operator: <>"),
+ id="Nonsense operator",
+ ),
+ ),
+)
+def test_parse_requirement_errors_on_invalid_spec(spec, ctx):
+ with ctx:
+ parse_requirement(spec)
diff --git a/tests/_legacy/install/test_mlpackage.py b/tests/_legacy/install/test_mlpackage.py
new file mode 100644
index 0000000000..d27e69b2ba
--- /dev/null
+++ b/tests/_legacy/install/test_mlpackage.py
@@ -0,0 +1,122 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import pathlib
+from unittest.mock import MagicMock
+
+import pytest
+
+from smartsim._core._install.mlpackages import (
+ MLPackage,
+ MLPackageCollection,
+ RAIPatch,
+ load_platform_configs,
+)
+from smartsim._core._install.platform import Platform
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+mock_platform = MagicMock(spec=Platform)
+
+
+@pytest.fixture
+def mock_ml_packages():
+ foo = MagicMock(spec=MLPackage)
+ foo.name = "foo"
+ bar = MagicMock(spec=MLPackage)
+ bar.name = "bar"
+ yield [foo, bar]
+
+
+@pytest.mark.parametrize(
+ "patch",
+ [MagicMock(spec=RAIPatch), [MagicMock(spec=RAIPatch) for i in range(3)], ()],
+ ids=["one patch", "multiple patches", "no patch"],
+)
+def test_mlpackage_constructor(patch):
+ MLPackage(
+ "foo",
+ "0.0.0",
+ "https://nothing.com",
+ ["bar==0.1", "baz==0.2"],
+ pathlib.Path("/nothing/fake"),
+ patch,
+ )
+
+
+def test_mlpackage_collection_constructor(mock_ml_packages):
+ MLPackageCollection(mock_platform, mock_ml_packages)
+
+
+def test_mlpackage_collection_mutable_mapping_methods(mock_ml_packages):
+ ml_packages = MLPackageCollection(mock_platform, mock_ml_packages)
+ for val in ml_packages._ml_packages.values():
+ val.version = "0.0.0"
+ assert ml_packages._ml_packages == ml_packages
+
+ # Test iter
+ package_names = [pkg.name for pkg in mock_ml_packages]
+ assert [name for name in ml_packages] == package_names
+
+ # Test get item
+ for pkg in mock_ml_packages:
+ assert ml_packages[pkg.name] is pkg
+
+ # Test len
+ assert len(ml_packages) == len(mock_ml_packages)
+
+ # Test delitem
+ key = next(iter(mock_ml_packages)).name
+ del ml_packages[key]
+ with pytest.raises(KeyError):
+ ml_packages[key]
+ assert len(ml_packages) == (len(mock_ml_packages) - 1)
+
+ # Test setitem
+ with pytest.raises(TypeError):
+ ml_packages["baz"] = MagicMock(spec=MLPackage)
+
+ # Test contains
+ name, package = next(iter(ml_packages.items()))
+ assert name in ml_packages
+
+ # Test str
+ assert "Package" in str(ml_packages)
+ assert "Version" in str(ml_packages)
+ assert package.version in str(ml_packages)
+ assert name in str(ml_packages)
+
+
+def test_load_configs_raises_when_dir_dne(test_dir):
+ dne_dir = pathlib.Path(test_dir, "dne")
+ dir_str = os.fspath(dne_dir)
+ with pytest.raises(
+ FileNotFoundError,
+ match=f"Platform configuration directory `{dir_str}` does not exist",
+ ):
+ load_platform_configs(dne_dir)
diff --git a/tests/_legacy/install/test_package_retriever.py b/tests/_legacy/install/test_package_retriever.py
new file mode 100644
index 0000000000..d415ae2358
--- /dev/null
+++ b/tests/_legacy/install/test_package_retriever.py
@@ -0,0 +1,106 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import contextlib
+import filecmp
+import os
+import pathlib
+import random
+import string
+import tarfile
+import zipfile
+
+import pytest
+
+from smartsim._core._install.utils import retrieve
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+
+@contextlib.contextmanager
+def temp_cd(path):
+ original = os.getcwd()
+ os.chdir(path)
+ try:
+ yield
+ finally:
+ os.chdir(original)
+
+
+def make_test_file(test_file):
+ data = "".join(random.choices(string.ascii_letters + string.digits, k=1024))
+ with open(test_file, "w") as f:
+ f.write(data)
+
+
+def test_local_archive_zip(test_dir):
+ with temp_cd(test_dir):
+ test_file = "./test.data"
+ make_test_file(test_file)
+
+ zip_file = "./test.zip"
+ with zipfile.ZipFile(zip_file, "w") as f:
+ f.write(test_file)
+
+ retrieve(zip_file, pathlib.Path("./output"))
+
+ assert filecmp.cmp(
+ test_file, pathlib.Path("./output") / "test.data", shallow=False
+ )
+
+
+def test_local_archive_tgz(test_dir):
+ with temp_cd(test_dir):
+ test_file = "./test.data"
+ make_test_file(test_file)
+
+ tgz_file = "./test.tgz"
+ with tarfile.open(tgz_file, "w:gz") as f:
+ f.add(test_file)
+
+ retrieve(tgz_file, pathlib.Path("./output"))
+
+ assert filecmp.cmp(
+ test_file, pathlib.Path("./output") / "test.data", shallow=False
+ )
+
+
+def test_git(test_dir):
+ retrieve(
+ "https://github.com/CrayLabs/SmartSim.git",
+ f"{test_dir}/smartsim_git",
+ branch="master",
+ )
+ assert pathlib.Path(f"{test_dir}/smartsim_git").is_dir()
+
+
+def test_https(test_dir):
+ output_dir = pathlib.Path(test_dir) / "output"
+ retrieve(
+ "https://github.com/CrayLabs/SmartSim/archive/refs/tags/v0.5.0.zip", output_dir
+ )
+ assert output_dir.exists()
diff --git a/tests/_legacy/install/test_platform.py b/tests/_legacy/install/test_platform.py
new file mode 100644
index 0000000000..76ff3f76b1
--- /dev/null
+++ b/tests/_legacy/install/test_platform.py
@@ -0,0 +1,89 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+import os
+import platform
+
+import pytest
+
+from smartsim._core._install.platform import Architecture, Device, OperatingSystem
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+
+def test_device_cpu():
+ cpu_enum = Device.CPU
+ assert not cpu_enum.is_gpu()
+ assert not cpu_enum.is_cuda()
+ assert not cpu_enum.is_rocm()
+
+
+@pytest.mark.parametrize("cuda_device", Device.cuda_enums())
+def test_cuda(monkeypatch, test_dir, cuda_device):
+ version = cuda_device.value.split("-")[1]
+ fake_full_version = version + ".8888" + ".9999"
+ monkeypatch.setenv("CUDA_HOME", test_dir)
+
+ mock_version = dict(cuda=dict(version=fake_full_version))
+ print(mock_version)
+ with open(f"{test_dir}/version.json", "w") as outfile:
+ json.dump(mock_version, outfile)
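+ # the mocked version.json mimics the CUDA toolkit's metadata layout, e.g.
+ # {"cuda": {"version": "12.3.8888.9999"}} (numbers illustrative)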
+
+ assert Device.detect_cuda_version() == cuda_device
+ assert cuda_device.is_gpu()
+ assert cuda_device.is_cuda()
+ assert not cuda_device.is_rocm()
+
+
+@pytest.mark.parametrize("rocm_device", Device.rocm_enums())
+def test_rocm(monkeypatch, test_dir, rocm_device):
+ version = rocm_device.value.split("-")[1]
+ fake_full_version = version + ".8888" + "-9999"
+ monkeypatch.setenv("ROCM_HOME", test_dir)
+ info_dir = f"{test_dir}/.info"
+ os.mkdir(info_dir)
+
+ with open(f"{info_dir}/version", "w") as outfile:
+ outfile.write(fake_full_version)
+
+ assert Device.detect_rocm_version() == rocm_device
+ assert rocm_device.is_gpu()
+ assert not rocm_device.is_cuda()
+ assert rocm_device.is_rocm()
+
+
+@pytest.mark.parametrize("os", ("linux", "darwin"))
+def test_operating_system(monkeypatch, os):
+ monkeypatch.setattr(platform, "system", lambda: os)
+ assert OperatingSystem.autodetect().value == os
+
+
+@pytest.mark.parametrize("arch", ("x86_64", "arm64"))
+def test_architecture(monkeypatch, arch):
+ monkeypatch.setattr(platform, "machine", lambda: arch)
+ assert Architecture.autodetect().value == arch
diff --git a/tests/_legacy/install/test_redisai_builder.py b/tests/_legacy/install/test_redisai_builder.py
new file mode 100644
index 0000000000..81673a7f12
--- /dev/null
+++ b/tests/_legacy/install/test_redisai_builder.py
@@ -0,0 +1,60 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from pathlib import Path
+
+import pytest
+
+from smartsim._core._install.buildenv import BuildEnv
+from smartsim._core._install.mlpackages import (
+ DEFAULT_MLPACKAGE_PATH,
+ MLPackage,
+ load_platform_configs,
+)
+from smartsim._core._install.platform import Platform
+from smartsim._core._install.redisaiBuilder import RedisAIBuilder
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+DEFAULT_MLPACKAGES = load_platform_configs(DEFAULT_MLPACKAGE_PATH)
+
+
+@pytest.mark.parametrize(
+ "platform",
+ [platform for platform in DEFAULT_MLPACKAGES],
+ ids=[str(platform) for platform in DEFAULT_MLPACKAGES],
+)
+def test_backends_to_be_installed(monkeypatch, test_dir, platform):
+ mlpackages = DEFAULT_MLPACKAGES[platform]
+ monkeypatch.setattr(MLPackage, "retrieve", lambda *args, **kwargs: None)
+ builder = RedisAIBuilder(platform, mlpackages, BuildEnv(), Path(test_dir))
+
+ BACKENDS = ["libtorch", "libtensorflow", "onnxruntime"]
+ TOGGLES = ["build_torch", "build_tensorflow", "build_onnxruntime"]
+
+ for backend, toggle in zip(BACKENDS, TOGGLES):
+ assert getattr(builder, toggle) == (backend in mlpackages)
diff --git a/tests/_legacy/on_wlm/test_dragon.py b/tests/_legacy/on_wlm/test_dragon.py
index b685b65020..d835d60ce1 100644
--- a/tests/_legacy/on_wlm/test_dragon.py
+++ b/tests/_legacy/on_wlm/test_dragon.py
@@ -56,7 +56,7 @@ def test_dragon_global_path(global_dragon_teardown, wlmutils, test_dir, monkeypa
def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch):
monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH", raising=False)
- monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", raising=False)
+ monkeypatch.delenv("_SMARTSIM_DRAGON_SERVER_PATH_EXP", raising=False)
exp: Experiment = Experiment(
"test_dragon_connection",
exp_path=test_dir,
diff --git a/tests/_legacy/test_cli.py b/tests/_legacy/test_cli.py
index 397f1196c6..c47ea046b7 100644
--- a/tests/_legacy/test_cli.py
+++ b/tests/_legacy/test_cli.py
@@ -436,24 +436,22 @@ def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = No
# fmt: off
@pytest.mark.parametrize(
- "command,mock_location,exp_output,optional_arg,exp_valid,exp_err_msg,check_prop,exp_prop_val",
+ "command, mock_location, exp_output, optional_arg, exp_valid, exp_err_msg, check_prop, exp_prop_val",
[
- pytest.param("build", "build_execute", "verbose mocked-build", "-v", True, "", "v", True, id="verbose 'on'"),
- pytest.param("build", "build_execute", "cpu mocked-build", "--device=cpu", True, "", "device", "cpu", id="device 'cpu'"),
- pytest.param("build", "build_execute", "gpu mocked-build", "--device=gpu", True, "", "device", "gpu", id="device 'gpu'"),
- pytest.param("build", "build_execute", "gpuX mocked-build", "--device=gpux", False, "invalid choice: 'gpux'", "", "", id="set bad device 'gpuX'"),
- pytest.param("build", "build_execute", "no tensorflow mocked-build", "--no_tf", True, "", "no_tf", True, id="set no TF"),
- pytest.param("build", "build_execute", "no torch mocked-build", "--no_pt", True, "", "no_pt", True, id="set no torch"),
- pytest.param("build", "build_execute", "onnx mocked-build", "--onnx", True, "", "onnx", True, id="set w/onnx"),
- pytest.param("build", "build_execute", "torch-dir mocked-build", "--torch_dir /foo/bar", True, "", "torch_dir", "/foo/bar", id="set torch dir"),
- pytest.param("build", "build_execute", "bad-torch-dir mocked-build", "--torch_dir", False, "error: argument --torch_dir", "", "", id="set torch dir, no path"),
- pytest.param("build", "build_execute", "keydb mocked-build", "--keydb", True, "", "keydb", True, id="keydb on"),
- pytest.param("clean", "clean_execute", "clobbering mocked-clean", "--clobber", True, "", "clobber", True, id="clean w/clobber"),
- pytest.param("validate", "validate_execute", "port mocked-validate", "--port=12345", True, "", "port", 12345, id="validate w/ manual port"),
- pytest.param("validate", "validate_execute", "abbrv port mocked-validate", "-p 12345", True, "", "port", 12345, id="validate w/ manual abbreviated port"),
- pytest.param("validate", "validate_execute", "cpu mocked-validate", "--device=cpu", True, "", "device", "cpu", id="validate: device 'cpu'"),
- pytest.param("validate", "validate_execute", "gpu mocked-validate", "--device=gpu", True, "", "device", "gpu", id="validate: device 'gpu'"),
- pytest.param("validate", "validate_execute", "gpuX mocked-validate", "--device=gpux", False, "invalid choice: 'gpux'", "", "", id="validate: set bad device 'gpuX'"),
+ pytest.param( "build", "build_execute", "verbose mocked-build", "-v", True, "", "v", True, id="verbose 'on'"),
+ pytest.param( "build", "build_execute", "cpu mocked-build", "--device=cpu", True, "", "device", "cpu", id="device 'cpu'"),
+ pytest.param( "build", "build_execute", "gpuX mocked-build", "--device=gpux", False, "invalid choice: 'gpux'", "", "", id="set bad device 'gpuX'"),
+ pytest.param( "build", "build_execute", "no tensorflow mocked-build", "--skip-tensorflow", True, "", "no_tf", True, id="Skip TF"),
+ pytest.param( "build", "build_execute", "no torch mocked-build", "--skip-torch", True, "", "no_pt", True, id="Skip Torch"),
+ pytest.param( "build", "build_execute", "onnx mocked-build", "--skip-onnx", True, "", "onnx", True, id="Skip Onnx"),
+ pytest.param( "build", "build_execute", "config-dir mocked-build", "--config-dir /foo/bar", True, "", "config-dir", "/foo/bar", id="set torch dir"),
+ pytest.param( "build", "build_execute", "bad-config-dir mocked-build", "--config-dir", False, "error: argument --config-dir", "", "", id="set config dir w/o path"),
+ pytest.param( "clean", "clean_execute", "clobbering mocked-clean", "--clobber", True, "", "clobber", True, id="clean w/clobber"),
+ pytest.param("validate", "validate_execute", "port mocked-validate", "--port=12345", True, "", "port", 12345, id="validate w/ manual port"),
+ pytest.param("validate", "validate_execute", "abbrv port mocked-validate", "-p 12345", True, "", "port", 12345, id="validate w/ manual abbreviated port"),
+ pytest.param("validate", "validate_execute", "cpu mocked-validate", "--device=cpu", True, "", "device", "cpu", id="validate: device 'cpu'"),
+ pytest.param("validate", "validate_execute", "gpu mocked-validate", "--device=gpu", True, "", "device", "gpu", id="validate: device 'gpu'"),
+ pytest.param("validate", "validate_execute", "gpuX mocked-validate", "--device=gpux", False, "invalid choice: 'gpux'", "", "", id="validate: set bad device 'gpuX'"),
]
)
# fmt: on
@@ -733,19 +731,7 @@ def mock_operation(*args, **kwargs) -> int:
# mock out the internal get_fs_path method so we don't actually do file system ops
monkeypatch.setattr(smartsim._core._cli.build, "tabulate", mock_operation)
- monkeypatch.setattr(
- smartsim._core._cli.build, "build_feature_store", mock_operation
- )
monkeypatch.setattr(smartsim._core._cli.build, "build_redis_ai", mock_operation)
- monkeypatch.setattr(
- smartsim._core._cli.build, "check_py_torch_version", mock_operation
- )
- monkeypatch.setattr(
- smartsim._core._cli.build, "check_py_tf_version", mock_operation
- )
- monkeypatch.setattr(
- smartsim._core._cli.build, "check_py_onnx_version", mock_operation
- )
command = "build"
cfg = MenuItemConfig(
diff --git a/tests/_legacy/test_colo_model_local.py b/tests/_legacy/test_colo_model_local.py
index 1ab97c4cc3..54848907d3 100644
--- a/tests/_legacy/test_colo_model_local.py
+++ b/tests/_legacy/test_colo_model_local.py
@@ -29,7 +29,7 @@
import pytest
from smartsim import Experiment
-from smartsim.entity import Application
+from smartsim._core.utils.helpers import _create_pinning_string
from smartsim.error import SSUnsupportedError
from smartsim.status import JobStatus
@@ -116,7 +116,7 @@ def test_unsupported_custom_pinning(fileutils, test_dir, coloutils, custom_pinni
],
)
def test_create_pinning_string(pin_list, num_cpus, expected):
- assert Application._create_pinning_string(pin_list, num_cpus) == expected
+ assert _create_pinning_string(pin_list, num_cpus) == expected
@pytest.mark.parametrize("fs_type", supported_fss)
diff --git a/tests/_legacy/test_config.py b/tests/_legacy/test_config.py
index 00a1fcdd36..5a84103ffd 100644
--- a/tests/_legacy/test_config.py
+++ b/tests/_legacy/test_config.py
@@ -66,9 +66,9 @@ def get_redisai_env(
"""
env = os.environ.copy()
if rai_path is not None:
- env["RAI_PATH"] = rai_path
+ env["SMARTSIM_RAI_LIB"] = rai_path
else:
- env.pop("RAI_PATH", None)
+ env.pop("SMARTSIM_RAI_LIB", None)
if lib_path is not None:
env["SMARTSIM_DEP_INSTALL_PATH"] = lib_path
@@ -85,7 +85,7 @@ def make_file(filepath: str) -> None:
def test_redisai_invalid_rai_path(test_dir, monkeypatch):
- """An invalid RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should fail"""
+ """An invalid SMARTSIM_RAI_LIB and valid SMARTSIM_DEP_INSTALL_PATH should fail"""
rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so")
make_file(os.path.join(test_dir, "lib", "redisai.so"))
@@ -94,7 +94,7 @@ def test_redisai_invalid_rai_path(test_dir, monkeypatch):
config = Config()
- # Fail when no file exists @ RAI_PATH
+ # Fail when no file exists @ SMARTSIM_RAI_LIB
with pytest.raises(SSConfigError) as ex:
_ = config.redisai
@@ -102,7 +102,7 @@ def test_redisai_invalid_rai_path(test_dir, monkeypatch):
def test_redisai_valid_rai_path(test_dir, monkeypatch):
- """A valid RAI_PATH should override valid SMARTSIM_DEP_INSTALL_PATH and succeed"""
+ """A valid SMARTSIM_RAI_LIB should override valid SMARTSIM_DEP_INSTALL_PATH and succeed"""
rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so")
make_file(rai_file_path)
@@ -117,7 +117,7 @@ def test_redisai_valid_rai_path(test_dir, monkeypatch):
def test_redisai_invalid_lib_path(test_dir, monkeypatch):
- """Invalid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should fail"""
+ """Invalid SMARTSIM_RAI_LIB and invalid SMARTSIM_DEP_INSTALL_PATH should fail"""
rai_file_path = f"{test_dir}/railib/redisai.so"
@@ -133,7 +133,7 @@ def test_redisai_invalid_lib_path(test_dir, monkeypatch):
def test_redisai_valid_lib_path(test_dir, monkeypatch):
- """Valid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should succeed"""
+ """Valid SMARTSIM_RAI_LIB and invalid SMARTSIM_DEP_INSTALL_PATH should succeed"""
rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so")
make_file(rai_file_path)
@@ -147,7 +147,7 @@ def test_redisai_valid_lib_path(test_dir, monkeypatch):
def test_redisai_valid_lib_path_null_rai(test_dir, monkeypatch):
- """Missing RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should succeed"""
+ """Missing SMARTSIM_RAI_LIB and valid SMARTSIM_DEP_INSTALL_PATH should succeed"""
rai_file_path: t.Optional[str] = None
lib_file_path = os.path.join(test_dir, "lib", "redisai.so")
@@ -166,11 +166,11 @@ def test_redis_conf():
assert Path(config.database_conf).is_file()
assert isinstance(config.database_conf, str)
- os.environ["REDIS_CONF"] = "not/a/path"
+ os.environ["SMARTSIM_REDIS_CONF"] = "not/a/path"
config = Config()
with pytest.raises(SSConfigError):
config.database_conf
- os.environ.pop("REDIS_CONF")
+ os.environ.pop("SMARTSIM_REDIS_CONF")
def test_redis_exe():
@@ -178,11 +178,11 @@ def test_redis_exe():
assert Path(config.database_exe).is_file()
assert isinstance(config.database_exe, str)
- os.environ["REDIS_PATH"] = "not/a/path"
+ os.environ["SMARTSIM_REDIS_SERVER_EXE"] = "not/a/path"
config = Config()
with pytest.raises(SSConfigError):
config.database_exe
- os.environ.pop("REDIS_PATH")
+ os.environ.pop("SMARTSIM_REDIS_SERVER_EXE")
def test_redis_cli():
@@ -190,11 +190,11 @@ def test_redis_cli():
assert Path(config.redisai).is_file()
assert isinstance(config.redisai, str)
- os.environ["REDIS_CLI_PATH"] = "not/a/path"
+ os.environ["SMARTSIM_REDIS_CLI_EXE"] = "not/a/path"
config = Config()
with pytest.raises(SSConfigError):
config.database_cli
- os.environ.pop("REDIS_CLI_PATH")
+ os.environ.pop("SMARTSIM_REDIS_CLI_EXE")
@pytest.mark.parametrize(
diff --git a/tests/_legacy/test_dragon_installer.py b/tests/_legacy/test_dragon_installer.py
index b23a1a7ef0..8ce7404c5f 100644
--- a/tests/_legacy/test_dragon_installer.py
+++ b/tests/_legacy/test_dragon_installer.py
@@ -31,12 +31,17 @@
from collections import namedtuple
import pytest
+from github.GitRelease import GitRelease
from github.GitReleaseAsset import GitReleaseAsset
from github.Requester import Requester
import smartsim
+import smartsim._core._install.utils
import smartsim._core.utils.helpers as helpers
from smartsim._core._cli.scripts.dragon_install import (
+ DEFAULT_DRAGON_REPO,
+ DEFAULT_DRAGON_VERSION,
+ DragonInstallRequest,
cleanup,
create_dotenv,
install_dragon,
@@ -58,14 +63,25 @@
def test_archive(test_dir: str, archive_path: pathlib.Path) -> pathlib.Path:
"""Fixture for returning a simple tarfile to test on"""
num_files = 10
+
+ archive_name = archive_path.name
+ archive_name = archive_name.replace(".tar.gz", "")
+
with tarfile.TarFile.open(archive_path, mode="w:gz") as tar:
- mock_whl = pathlib.Path(test_dir) / "mock.whl"
+ mock_whl = pathlib.Path(test_dir) / archive_name / f"{archive_name}.whl"
+ mock_whl.parent.mkdir(parents=True, exist_ok=True)
mock_whl.touch()
+ tar.add(mock_whl)
+
for i in range(num_files):
- content = pathlib.Path(test_dir) / f"{i:04}.txt"
+ content = pathlib.Path(test_dir) / archive_name / f"{i:04}.txt"
content.write_text(f"i am file {i}\n")
tar.add(content)
+ content.unlink()
+
+ mock_whl.unlink()
+
return archive_path
@@ -118,11 +134,41 @@ def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset]
_git_attr(value=f"http://foo/{archive_name}"),
)
monkeypatch.setattr(asset, "_name", _git_attr(value=archive_name))
+ monkeypatch.setattr(asset, "_id", _git_attr(value=123))
assets.append(asset)
return assets
+@pytest.fixture
+def test_releases(monkeypatch: pytest.MonkeyPatch) -> t.List[GitRelease]:
+ requester = Requester(
+ auth=None,
+ base_url="https://github.com",
+ user_agent="mozilla",
+ per_page=10,
+ verify=False,
+ timeout=1,
+ retry=1,
+ pool_size=1,
+ )
+ headers = {"mock-header": "mock-value"}
+ attributes = {"title": "mock-title"}
+ completed = True
+
+ releases: t.List[GitRelease] = []
+
+ for python_version in ["py3.9", "py3.10", "py3.11"]:
+ for dragon_version in ["dragon-0.8", "dragon-0.9", "dragon-0.10"]:
+ attributes = {
+ "title": f"{python_version}-{dragon_version}-release",
+ "tag_name": f"v{dragon_version}-weekly",
+ }
+ releases.append(GitRelease(requester, headers, attributes, completed))
+
+ return releases
+
+
def test_cleanup_no_op(archive_path: pathlib.Path) -> None:
"""Ensure that the cleanup method doesn't bomb when called with
missing archive path; simulate a failed download"""
@@ -143,17 +189,25 @@ def test_cleanup_archive_exists(test_archive: pathlib.Path) -> None:
assert not test_archive.exists()
-def test_retrieve_cached(
- test_dir: str,
- # archive_path: pathlib.Path,
+@pytest.mark.skip("Deprecated due to builder.py changes")
+def test_retrieve_updated(
test_archive: pathlib.Path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
- """Verify that a previously retrieved asset archive is re-used"""
- with tarfile.TarFile.open(test_archive) as tar:
- tar.extractall(test_dir)
+ """Verify that a previously retrieved asset archive is not re-used if a new
+ version is found"""
- ts1 = test_archive.parent.stat().st_ctime
+ old_asset_id = 100
+ asset_id = 123
+
+ def mock__retrieve_archive(source_, destination_) -> None:
+ mock_extraction_dir = pathlib.Path(destination_)
+ with tarfile.TarFile.open(test_archive) as tar:
+ tar.extractall(mock_extraction_dir)
+
+ # we'll use the mock extract to create the files that would normally be downloaded
+ expected_output_dir = test_archive.parent / str(asset_id)
+ old_output_dir = test_archive.parent / str(old_asset_id)
requester = Requester(
auth=None,
@@ -174,14 +228,22 @@ def test_retrieve_cached(
# ensure mocked asset has values that we use...
monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo"))
monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name))
+ monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id))
+ monkeypatch.setattr(
+ smartsim._core._install.utils,
+ "retrieve",
+ lambda s_, d_: mock__retrieve_archive(s_, expected_output_dir),
+ ) # mock the retrieval of the updated archive
+
+ # tell it to retrieve. it should return the path to the new download, not the old one
+ request = DragonInstallRequest(test_archive.parent)
+ asset_path = retrieve_asset(request, asset)
- asset_path = retrieve_asset(test_archive.parent, asset)
- ts2 = asset_path.stat().st_ctime
+ # sanity check we don't have the same paths
+ assert old_output_dir != expected_output_dir
- assert (
- asset_path == test_archive.parent
- ) # show that the expected path matches the output path
- assert ts1 == ts2 # show that the file wasn't changed...
+ # verify the "cached" copy wasn't used
+ assert asset_path == expected_output_dir
@pytest.mark.parametrize(
@@ -214,11 +276,13 @@ def test_retrieve_cached(
)
def test_retrieve_asset_info(
test_assets: t.Collection[GitReleaseAsset],
+ test_releases: t.Collection[GitRelease],
monkeypatch: pytest.MonkeyPatch,
dragon_pin: str,
pyv: str,
is_found: bool,
is_crayex: bool,
+ test_dir: str,
) -> None:
"""Verify that an information is retrieved correctly based on the python
version, platform (e.g. CrayEX, !CrayEx), and target dragon pin"""
@@ -234,20 +298,23 @@ def test_retrieve_asset_info(
"is_crayex_platform",
lambda: is_crayex,
)
+ # avoid hitting github API
ctx.setattr(
smartsim._core._cli.scripts.dragon_install,
- "dragon_pin",
- lambda: dragon_pin,
+ "_get_all_releases",
+ lambda x: test_releases,
)
# avoid hitting github API
ctx.setattr(
smartsim._core._cli.scripts.dragon_install,
"_get_release_assets",
- lambda: test_assets,
+ lambda x: test_assets,
)
+ request = DragonInstallRequest(test_dir, version=dragon_pin)
+
if is_found:
- chosen_asset = retrieve_asset_info()
+ chosen_asset = retrieve_asset_info(request)
assert chosen_asset
assert pyv in chosen_asset.name
@@ -259,7 +326,7 @@ def test_retrieve_asset_info(
assert "crayex" not in chosen_asset.name.lower()
else:
with pytest.raises(SmartSimCLIActionCancelled):
- retrieve_asset_info()
+ retrieve_asset_info(request)
def test_check_for_utility_missing(test_dir: str) -> None:
@@ -357,23 +424,56 @@ def mock_util_check(util: str) -> bool:
assert is_cray == platform_result
-def test_install_package_no_wheel(extraction_dir: pathlib.Path):
+def test_install_package_no_wheel(test_dir: str, extraction_dir: pathlib.Path):
"""Verify that a missing wheel does not blow up and has a failure retcode"""
exp_path = extraction_dir
+ request = DragonInstallRequest(test_dir)
- result = install_package(exp_path)
+ result = install_package(request, exp_path)
assert result != 0
def test_install_macos(monkeypatch: pytest.MonkeyPatch, extraction_dir: pathlib.Path):
- """Verify that installation exits cleanly if installing on unsupported platform"""
+ """Verify that installation exits cleanly if installing on unsupported platform."""
with monkeypatch.context() as ctx:
ctx.setattr(sys, "platform", "darwin")
- result = install_dragon(extraction_dir)
+ request = DragonInstallRequest(extraction_dir)
+
+ result = install_dragon(request)
assert result == 1
+@pytest.mark.parametrize(
+ "version, exp_result",
+ [
+ pytest.param("0.9", 2, id="0.9 DNE In Public Repo"),
+ pytest.param("0.91", 2, id="0.91 DNE In Public Repo"),
+ pytest.param("0.10", 0, id="0.10 Exists In Public Repo"),
+ pytest.param("0.19", 2, id="0.19 DNE In Public Repo"),
+ ],
+)
+def test_install_specify_asset_version(
+ monkeypatch: pytest.MonkeyPatch,
+ extraction_dir: pathlib.Path,
+ version: str,
+ exp_result: int,
+):
+ """Verify that installation completes as expected when fed a variety of
+ version numbers that can or cannot be found on release assets of the
+ public dragon repository.
+
+ :param extraction_dir: file system path where the dragon package should
+ be downloaded and extracted
+ :param version: Dragon version number to attempt to install
+ :param exp_result: Expected return code from the call to `install_dragon`
+ """
+ request = DragonInstallRequest(extraction_dir, version=version)
+
+ result = install_dragon(request)
+ assert result == exp_result
+
+
def test_create_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir: str):
"""Verify that attempting to create a .env file without any existing
file or container directory works"""
@@ -387,7 +487,7 @@ def test_create_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir: str):
# ensure no .env exists before trying to create it.
assert not exp_env_path.exists()
- create_dotenv(mock_dragon_root)
+ create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION)
# ensure the .env is created as side-effect of create_dotenv
assert exp_env_path.exists()
@@ -409,7 +509,7 @@ def test_create_dotenv_existing_dir(monkeypatch: pytest.MonkeyPatch, test_dir: s
# ensure no .env exists before trying to create it.
assert not exp_env_path.exists()
- create_dotenv(mock_dragon_root)
+ create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION)
# ensure the .env is created as side-effect of create_dotenv
assert exp_env_path.exists()
@@ -434,17 +534,25 @@ def test_create_dotenv_existing_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir
# ensure .env exists so we can update it
assert exp_env_path.exists()
- create_dotenv(mock_dragon_root)
+ create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION)
# ensure the .env is created as side-effect of create_dotenv
assert exp_env_path.exists()
# ensure file was overwritten and env vars are not duplicated
dotenv_content = exp_env_path.read_text(encoding="utf-8")
- split_content = dotenv_content.split(var_name)
-
- # split to confirm env var only appars once
- assert len(split_content) == 2
+ lines = [
+ line for line in dotenv_content.split("\n") if line and "#" not in line
+ ]
+ for line in lines:
+ if line.startswith(var_name):
+ # make sure the var isn't defined recursively
+ # DRAGON_BASE_DIR=$DRAGON_BASE_DIR
+ assert var_name not in line[len(var_name) + 1 :]
+ else:
+ # make sure any values reference the original base dir var
+ if var_name in line:
+ assert f"${var_name}" in line
def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str):
@@ -456,13 +564,13 @@ def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str):
with monkeypatch.context() as ctx:
ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path)
- create_dotenv(mock_dragon_root)
+ create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION)
# ensure the .env is created as side-effect of create_dotenv
content = exp_env_path.read_text(encoding="utf-8")
# ensure we have values written, but ignore empty lines
- lines = [line for line in content.split("\n") if line]
+ lines = [line for line in content.split("\n") if line and "#" not in line]
assert lines
# ensure each line is formatted as key=value
diff --git a/tests/_legacy/test_dragon_launcher.py b/tests/_legacy/test_dragon_launcher.py
index 77f094b7d7..c4f241b24b 100644
--- a/tests/_legacy/test_dragon_launcher.py
+++ b/tests/_legacy/test_dragon_launcher.py
@@ -37,7 +37,10 @@
import zmq
import smartsim._core.config
-from smartsim._core._cli.scripts.dragon_install import create_dotenv
+from smartsim._core._cli.scripts.dragon_install import (
+ DEFAULT_DRAGON_VERSION,
+ create_dotenv,
+)
from smartsim._core.config.config import get_config
from smartsim._core.launcher.dragon.dragon_launcher import (
DragonConnector,
@@ -494,7 +497,7 @@ def test_load_env_env_file_created(monkeypatch: pytest.MonkeyPatch, test_dir: st
with monkeypatch.context() as ctx:
ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path)
- create_dotenv(mock_dragon_root)
+ create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION)
dragon_conf = smartsim._core.config.CONFIG.dragon_dotenv
# verify config does exist
@@ -507,7 +510,26 @@ def test_load_env_env_file_created(monkeypatch: pytest.MonkeyPatch, test_dir: st
assert loaded_env
# confirm .env was parsed as expected by inspecting a key
+ assert "DRAGON_BASE_DIR" in loaded_env
+ base_dir = loaded_env["DRAGON_BASE_DIR"]
+
assert "DRAGON_ROOT_DIR" in loaded_env
+ assert loaded_env["DRAGON_ROOT_DIR"] == base_dir
+
+ assert "DRAGON_INCLUDE_DIR" in loaded_env
+ assert loaded_env["DRAGON_INCLUDE_DIR"] == f"{base_dir}/include"
+
+ assert "DRAGON_LIB_DIR" in loaded_env
+ assert loaded_env["DRAGON_LIB_DIR"] == f"{base_dir}/lib"
+
+ assert "DRAGON_VERSION" in loaded_env
+ assert loaded_env["DRAGON_VERSION"] == DEFAULT_DRAGON_VERSION
+
+ assert "PATH" in loaded_env
+ assert loaded_env["PATH"] == f"{base_dir}/bin"
+
+ assert "LD_LIBRARY_PATH" in loaded_env
+ assert loaded_env["LD_LIBRARY_PATH"] == f"{base_dir}/lib"
def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str):
@@ -517,7 +539,7 @@ def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str):
with monkeypatch.context() as ctx:
ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path)
- create_dotenv(mock_dragon_root)
+ create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION)
# load config w/launcher
connector = DragonConnector()
@@ -541,7 +563,7 @@ def test_merge_env(monkeypatch: pytest.MonkeyPatch, test_dir: str):
with monkeypatch.context() as ctx:
ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path)
- create_dotenv(mock_dragon_root)
+ create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION)
# load config w/launcher
connector = DragonConnector()
@@ -593,11 +615,14 @@ def test_run_step_fail(test_dir: str) -> None:
step0 = DragonStep("step0", test_dir, rs)
step0.meta["status_dir"] = status_dir
- mock_connector = MagicMock() # DragonConnector()
+ mock_connector = MagicMock(spec=DragonConnector)
mock_connector.is_connected = True
mock_connector.send_request = MagicMock(
return_value=DragonRunResponse(step_id=step0.name, error_message="mock fail!")
)
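+ # stub merge_persisted_env with arbitrary env values the launcher passes to the step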
+ mock_connector.merge_persisted_env = MagicMock(
+ return_value={"FOO": "bar", "BAZ": "boop"}
+ )
launcher = DragonLauncher()
launcher._connector = mock_connector
@@ -676,7 +701,7 @@ def test_run_step_success(test_dir: str) -> None:
step0 = DragonStep("step0", test_dir, rs)
step0.meta["status_dir"] = status_dir
- mock_connector = MagicMock() # DragonConnector()
+ mock_connector = MagicMock(spec=DragonConnector)
mock_connector.is_connected = True
mock_connector.send_request = MagicMock(
return_value=DragonRunResponse(step_id=step0.name)
@@ -684,6 +709,9 @@ def test_run_step_success(test_dir: str) -> None:
launcher = DragonLauncher()
launcher._connector = mock_connector
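+ # stub merge_persisted_env with arbitrary env values the launcher passes to the step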
+ mock_connector.merge_persisted_env = MagicMock(
+ return_value={"FOO": "bar", "BAZ": "boop"}
+ )
result = launcher.run(step0)
diff --git a/tests/_legacy/test_dragon_run_policy.py b/tests/_legacy/test_dragon_run_policy.py
index 5da84bf305..14219f9a32 100644
--- a/tests/_legacy/test_dragon_run_policy.py
+++ b/tests/_legacy/test_dragon_run_policy.py
@@ -114,9 +114,6 @@ def test_create_run_policy_non_run_request(dragon_request: DragonRequest) -> Non
policy = DragonBackend.create_run_policy(dragon_request, "localhost")
assert policy is not None, "Default policy was not returned"
- assert (
- policy.device == Policy.Device.DEFAULT
- ), "Default device was not Device.DEFAULT"
assert policy.cpu_affinity == [], "Default cpu affinity was not empty"
assert policy.gpu_affinity == [], "Default gpu affinity was not empty"
@@ -140,10 +137,8 @@ def test_create_run_policy_run_request_no_run_policy() -> None:
policy = DragonBackend.create_run_policy(run_req, "localhost")
- assert policy.device == Policy.Device.DEFAULT
assert set(policy.cpu_affinity) == set()
assert policy.gpu_affinity == []
- assert policy.affinity == Policy.Affinity.DEFAULT
@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
@@ -167,7 +162,6 @@ def test_create_run_policy_run_request_default_run_policy() -> None:
assert set(policy.cpu_affinity) == set()
assert set(policy.gpu_affinity) == set()
- assert policy.affinity == Policy.Affinity.DEFAULT
@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
@@ -192,7 +186,6 @@ def test_create_run_policy_run_request_cpu_affinity_no_device() -> None:
assert set(policy.cpu_affinity) == affinity
assert policy.gpu_affinity == []
- assert policy.affinity == Policy.Affinity.SPECIFIC
@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
@@ -216,7 +209,6 @@ def test_create_run_policy_run_request_cpu_affinity() -> None:
assert set(policy.cpu_affinity) == affinity
assert policy.gpu_affinity == []
- assert policy.affinity == Policy.Affinity.SPECIFIC
@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
@@ -240,7 +232,6 @@ def test_create_run_policy_run_request_gpu_affinity() -> None:
assert policy.cpu_affinity == []
assert set(policy.gpu_affinity) == set(affinity)
- assert policy.affinity == Policy.Affinity.SPECIFIC
@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
diff --git a/tests/_legacy/test_dragon_run_request.py b/tests/_legacy/test_dragon_run_request.py
index f5fdc73a06..a1c1e495f3 100644
--- a/tests/_legacy/test_dragon_run_request.py
+++ b/tests/_legacy/test_dragon_run_request.py
@@ -30,63 +30,23 @@
import time
from unittest.mock import MagicMock
+import pydantic.error_wrappers
import pytest
-from pydantic import ValidationError
# The tests in this file belong to the group_b group
pytestmark = pytest.mark.group_b
-
-try:
- import dragon
-
- dragon_loaded = True
-except:
- dragon_loaded = False
+dragon = pytest.importorskip("dragon")
from smartsim._core.config import CONFIG
+from smartsim._core.launcher.dragon.dragon_backend import (
+ DragonBackend,
+ ProcessGroupInfo,
+)
+from smartsim._core.launcher.dragon.pqueue import NodePrioritizer
from smartsim._core.schemas.dragon_requests import *
from smartsim._core.schemas.dragon_responses import *
-from smartsim._core.utils.helpers import create_short_id_str
from smartsim.status import TERMINAL_STATUSES, InvalidJobStatus, JobStatus
-if t.TYPE_CHECKING:
- from smartsim._core.launcher.dragon.dragon_backend import (
- DragonBackend,
- ProcessGroupInfo,
- )
-
-
-class NodeMock(MagicMock):
- def __init__(
- self, name: t.Optional[str] = None, num_gpus: int = 2, num_cpus: int = 8
- ) -> None:
- super().__init__()
- self._mock_id = name
- NodeMock._num_gpus = num_gpus
- NodeMock._num_cpus = num_cpus
-
- @property
- def hostname(self) -> str:
- if self._mock_id:
- return self._mock_id
- return create_short_id_str()
-
- @property
- def num_cpus(self) -> str:
- return NodeMock._num_cpus
-
- @property
- def num_gpus(self) -> str:
- return NodeMock._num_gpus
-
- def _set_id(self, value: str) -> None:
- self._mock_id = value
-
- def gpus(self, parent: t.Any = None) -> t.List[str]:
- if self._num_gpus:
- return [f"{self.hostname}-gpu{i}" for i in range(NodeMock._num_gpus)]
- return []
-
class GroupStateMock(MagicMock):
def Running(self) -> MagicMock:
@@ -102,59 +62,57 @@ class ProcessGroupMock(MagicMock):
puids = [121, 122]
-def node_mock() -> NodeMock:
- return NodeMock()
-
-
def get_mock_backend(
- monkeypatch: pytest.MonkeyPatch, num_gpus: int = 2
+ monkeypatch: pytest.MonkeyPatch, num_cpus: int, num_gpus: int
) -> "DragonBackend":
-
+ # create all the necessary namespaces as raw magic mocks
+ monkeypatch.setitem(sys.modules, "dragon.data.ddict.ddict", MagicMock())
+ monkeypatch.setitem(sys.modules, "dragon.native.machine", MagicMock())
+ monkeypatch.setitem(sys.modules, "dragon.native.group_state", MagicMock())
+ monkeypatch.setitem(sys.modules, "dragon.native.process_group", MagicMock())
+ monkeypatch.setitem(sys.modules, "dragon.native.process", MagicMock())
+ monkeypatch.setitem(sys.modules, "dragon.infrastructure.connection", MagicMock())
+ monkeypatch.setitem(sys.modules, "dragon.infrastructure.policy", MagicMock())
+ monkeypatch.setitem(sys.modules, "dragon.infrastructure.process_desc", MagicMock())
+ monkeypatch.setitem(sys.modules, "dragon.data.ddict.ddict", MagicMock())
+
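+ # mock cluster topology consumed by the patched dragon System/Node classes below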
+ node_list = ["node1", "node2", "node3"]
+ system_mock = MagicMock(return_value=MagicMock(nodes=node_list))
+ node_mock = lambda x: MagicMock(hostname=x, num_cpus=num_cpus, num_gpus=num_gpus)
+ process_group_mock = MagicMock(return_value=ProcessGroupMock())
process_mock = MagicMock(returncode=0)
- process_group_mock = MagicMock(**{"Process.return_value": ProcessGroupMock()})
- process_module_mock = MagicMock()
- process_module_mock.Process = process_mock
- node_mock = NodeMock(num_gpus=num_gpus)
- system_mock = MagicMock(nodes=["node1", "node2", "node3"])
+ policy_mock = MagicMock(return_value=MagicMock())
+ group_state_mock = GroupStateMock()
+
+ # customize members that must perform specific actions within the namespaces
monkeypatch.setitem(
sys.modules,
"dragon",
MagicMock(
**{
- "native.machine.Node.return_value": node_mock,
- "native.machine.System.return_value": system_mock,
- "native.group_state": GroupStateMock(),
- "native.process_group.ProcessGroup.return_value": ProcessGroupMock(),
+ "native.machine.Node": node_mock,
+ "native.machine.System": system_mock,
+ "native.group_state": group_state_mock,
+ "native.process_group.ProcessGroup": process_group_mock,
+ "native.process_group.Process": process_mock,
+ "native.process.Process": process_mock,
+ "infrastructure.policy.Policy": policy_mock,
}
),
)
- monkeypatch.setitem(
- sys.modules,
- "dragon.infrastructure.connection",
- MagicMock(),
- )
- monkeypatch.setitem(
- sys.modules,
- "dragon.infrastructure.policy",
- MagicMock(**{"Policy.return_value": MagicMock()}),
- )
- monkeypatch.setitem(sys.modules, "dragon.native.process", process_module_mock)
- monkeypatch.setitem(sys.modules, "dragon.native.process_group", process_group_mock)
-
- monkeypatch.setitem(sys.modules, "dragon.native.group_state", GroupStateMock())
- monkeypatch.setitem(
- sys.modules,
- "dragon.native.machine",
- MagicMock(
- **{"System.return_value": system_mock, "Node.return_value": node_mock}
- ),
- )
- from smartsim._core.launcher.dragon.dragon_backend import DragonBackend
dragon_backend = DragonBackend(pid=99999)
- monkeypatch.setattr(
- dragon_backend, "_free_hosts", collections.deque(dragon_backend._hosts)
+
+ # NOTE: we're manually updating these values due to issue w/mocking namespaces
+ dragon_backend._prioritizer = NodePrioritizer(
+ [
+ MagicMock(num_cpus=num_cpus, num_gpus=num_gpus, hostname=node)
+ for node in node_list
+ ],
+ dragon_backend._queue_lock,
)
+ dragon_backend._cpus = [num_cpus] * len(node_list)
+ dragon_backend._gpus = [num_gpus] * len(node_list)
return dragon_backend
@@ -212,16 +170,14 @@ def set_mock_group_infos(
}
monkeypatch.setattr(dragon_backend, "_group_infos", group_infos)
- monkeypatch.setattr(dragon_backend, "_free_hosts", collections.deque(hosts[1:3]))
- monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: "abc123-1"})
+ monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: {"abc123-1"}})
monkeypatch.setattr(dragon_backend, "_running_steps", ["abc123-1"])
return group_infos
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None:
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
handshake_req = DragonHandshakeRequest()
handshake_resp = dragon_backend.process_request(handshake_req)
@@ -230,9 +186,8 @@ def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None:
assert handshake_resp.dragon_pid == 99999
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
run_req = DragonRunRequest(
exe="sleep",
exe_args=["5"],
@@ -259,9 +214,9 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
assert dragon_backend._running_steps == [step_id]
assert len(dragon_backend._queued_steps) == 0
- assert len(dragon_backend._free_hosts) == 1
- assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
- assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+ assert len(dragon_backend.free_hosts) == 1
+ assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+ assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]
monkeypatch.setattr(
dragon_backend._group_infos[step_id].process_group, "status", "Running"
@@ -271,9 +226,9 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
assert dragon_backend._running_steps == [step_id]
assert len(dragon_backend._queued_steps) == 0
- assert len(dragon_backend._free_hosts) == 1
- assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
- assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+ assert len(dragon_backend.free_hosts) == 1
+ assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+ assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]
dragon_backend._group_infos[step_id].status = JobStatus.CANCELLED
@@ -281,9 +236,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
assert not dragon_backend._running_steps
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
dragon_backend._shutdown_requested = True
@@ -309,7 +263,7 @@ def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None:
"""Verify that a policy is applied to a run request"""
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
run_req = DragonRunRequest(
exe="sleep",
exe_args=["5"],
@@ -325,10 +279,9 @@ def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None:
assert run_req.policy is None
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None:
"""Verify that a policy is applied to a run request"""
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
run_req = DragonRunRequest(
exe="sleep",
exe_args=["5"],
@@ -356,9 +309,9 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None:
assert dragon_backend._running_steps == [step_id]
assert len(dragon_backend._queued_steps) == 0
- assert len(dragon_backend._free_hosts) == 1
- assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
- assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+ assert len(dragon_backend._prioritizer.unassigned()) == 1
+ assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+ assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]
monkeypatch.setattr(
dragon_backend._group_infos[step_id].process_group, "status", "Running"
@@ -368,9 +321,9 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None:
assert dragon_backend._running_steps == [step_id]
assert len(dragon_backend._queued_steps) == 0
- assert len(dragon_backend._free_hosts) == 1
- assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
- assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+ assert len(dragon_backend._prioritizer.unassigned()) == 1
+ assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+ assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]
dragon_backend._group_infos[step_id].status = JobStatus.CANCELLED
@@ -378,9 +331,8 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None:
assert not dragon_backend._running_steps
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None:
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
group_infos = set_mock_group_infos(monkeypatch, dragon_backend)
@@ -395,9 +347,8 @@ def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None:
}
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None:
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
group_infos = set_mock_group_infos(monkeypatch, dragon_backend)
running_steps = [
@@ -421,10 +372,9 @@ def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None:
assert dragon_backend._group_infos[step_id_to_stop].status == JobStatus.CANCELLED
assert len(dragon_backend._allocated_hosts) == 0
- assert len(dragon_backend._free_hosts) == 3
+ assert len(dragon_backend._prioritizer.unassigned()) == 3
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
@pytest.mark.parametrize(
"immediate, kill_jobs, frontend_shutdown",
[
@@ -443,7 +393,7 @@ def test_shutdown_request(
frontend_shutdown: bool,
) -> None:
monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0")
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
monkeypatch.setattr(dragon_backend, "_cooldown_period", 1)
set_mock_group_infos(monkeypatch, dragon_backend)
@@ -483,11 +433,10 @@ def test_shutdown_request(
assert dragon_backend._has_cooled_down == kill_jobs
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
@pytest.mark.parametrize("telemetry_flag", ["0", "1"])
def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None:
monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag)
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
expected_cooldown = (
2 * CONFIG.telemetry_frequency + 5 if int(telemetry_flag) > 0 else 5
@@ -499,19 +448,17 @@ def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -
assert dragon_backend.cooldown_period == expected_cooldown
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None:
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
first_heartbeat = dragon_backend.last_heartbeat
assert dragon_backend.current_time > first_heartbeat
dragon_backend._heartbeat()
assert dragon_backend.last_heartbeat > first_heartbeat
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
@pytest.mark.parametrize("num_nodes", [1, 3, 100])
def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None:
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
run_req = DragonRunRequest(
exe="sleep",
exe_args=["5"],
@@ -524,18 +471,42 @@ def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None:
pmi_enabled=False,
)
- assert dragon_backend._can_honor(run_req)[0] == (
- num_nodes <= len(dragon_backend._hosts)
- )
+ can_honor, error_msg = dragon_backend._can_honor(run_req)
+
+ nodes_in_range = num_nodes <= len(dragon_backend._hosts)
+ assert can_honor == nodes_in_range
+ assert (error_msg is None) if nodes_in_range else (error_msg is not None)
+
+
+@pytest.mark.parametrize("num_nodes", [-10, -1, 0])
+def test_can_honor_invalid_num_nodes(
+ monkeypatch: pytest.MonkeyPatch, num_nodes: int
+) -> None:
+ """Verify that requests for invalid numbers of nodes (negative, zero) are rejected"""
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
+
+ with pytest.raises(pydantic.error_wrappers.ValidationError) as ex:
+ DragonRunRequest(
+ exe="sleep",
+ exe_args=["5"],
+ path="/a/fake/path",
+ nodes=num_nodes,
+ tasks=1,
+ tasks_per_node=1,
+ env={},
+ current_env={},
+ pmi_enabled=False,
+ )
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
@pytest.mark.parametrize("affinity", [[0], [0, 1], list(range(8))])
def test_can_honor_cpu_affinity(
monkeypatch: pytest.MonkeyPatch, affinity: t.List[int]
) -> None:
"""Verify that valid CPU affinities are accepted"""
- dragon_backend = get_mock_backend(monkeypatch)
+ num_cpus, num_gpus = 8, 0
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=num_cpus, num_gpus=num_gpus)
+
run_req = DragonRunRequest(
exe="sleep",
exe_args=["5"],
@@ -552,11 +523,10 @@ def test_can_honor_cpu_affinity(
assert dragon_backend._can_honor(run_req)[0]
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None:
"""Verify that invalid CPU affinities are NOT accepted
NOTE: negative values are captured by the Pydantic schema"""
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
run_req = DragonRunRequest(
exe="sleep",
exe_args=["5"],
@@ -573,13 +543,15 @@ def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) ->
assert not dragon_backend._can_honor(run_req)[0]
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
@pytest.mark.parametrize("affinity", [[0], [0, 1]])
def test_can_honor_gpu_affinity(
monkeypatch: pytest.MonkeyPatch, affinity: t.List[int]
) -> None:
"""Verify that valid GPU affinities are accepted"""
- dragon_backend = get_mock_backend(monkeypatch)
+
+ num_cpus, num_gpus = 8, 2
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=num_cpus, num_gpus=num_gpus)
+
run_req = DragonRunRequest(
exe="sleep",
exe_args=["5"],
@@ -596,11 +568,10 @@ def test_can_honor_gpu_affinity(
assert dragon_backend._can_honor(run_req)[0]
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_can_honor_gpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None:
"""Verify that invalid GPU affinities are NOT accepted
NOTE: negative values are captured by the Pydantic schema"""
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
run_req = DragonRunRequest(
exe="sleep",
exe_args=["5"],
@@ -617,46 +588,46 @@ def test_can_honor_gpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) ->
assert not dragon_backend._can_honor(run_req)[0]
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_can_honor_gpu_device_not_available(monkeypatch: pytest.MonkeyPatch) -> None:
"""Verify that a request for a GPU if none exists is not accepted"""
# create a mock node class that always reports no GPUs available
- dragon_backend = get_mock_backend(monkeypatch, num_gpus=0)
-
- run_req = DragonRunRequest(
- exe="sleep",
- exe_args=["5"],
- path="/a/fake/path",
- nodes=2,
- tasks=1,
- tasks_per_node=1,
- env={},
- current_env={},
- pmi_enabled=False,
- # specify GPU device w/no affinity
- policy=DragonRunPolicy(gpu_affinity=[0]),
- )
-
- assert not dragon_backend._can_honor(run_req)[0]
+ with monkeypatch.context() as ctx:
+ dragon_backend = get_mock_backend(ctx, num_cpus=8, num_gpus=0)
+
+ run_req = DragonRunRequest(
+ exe="sleep",
+ exe_args=["5"],
+ path="/a/fake/path",
+ nodes=2,
+ tasks=1,
+ tasks_per_node=1,
+ env={},
+ current_env={},
+ pmi_enabled=False,
+ # specify GPU device w/no affinity
+ policy=DragonRunPolicy(gpu_affinity=[0]),
+ )
+ can_honor, _ = dragon_backend._can_honor(run_req)
+ assert not can_honor
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None:
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
step_id = next(dragon_backend._step_ids)
assert step_id.endswith("0")
assert step_id != next(dragon_backend._step_ids)
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
- dragon_backend = get_mock_backend(monkeypatch)
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
set_mock_group_infos(monkeypatch, dragon_backend)
hosts = dragon_backend.hosts
+ dragon_backend._prioritizer.increment(hosts[0])
- expected_message = textwrap.dedent(f"""\
+ expected_msg = textwrap.dedent(
+ f"""\
Dragon server backend update
| Host | Status |
|--------|----------|
@@ -664,15 +635,120 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
| {hosts[1]} | Free |
| {hosts[2]} | Free |
| Step | Status | Hosts | Return codes | Num procs |
- |----------|--------------|-------------|----------------|-------------|
+ |----------|--------------|-----------------|----------------|-------------|
| abc123-1 | Running | {hosts[0]} | | 1 |
| del999-2 | Cancelled | {hosts[1]} | -9 | 1 |
| c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 |
| 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 |
- | ljace0-5 | NeverStarted | | | 0 |""")
+ | ljace0-5 | NeverStarted | | | 0 |"""
+ )
# get rid of white space to make the comparison easier
actual_msg = dragon_backend.status_message.replace(" ", "")
- expected_message = expected_message.replace(" ", "")
+ expected_msg = expected_msg.replace(" ", "")
+
+ # ignore dashes in separators (hostname changes may cause column expansion)
+ while actual_msg.find("--") > -1:
+ actual_msg = actual_msg.replace("--", "-")
+ while expected_msg.find("--") > -1:
+ expected_msg = expected_msg.replace("--", "-")
+
+ assert actual_msg == expected_msg
+
+
+def test_can_honor_hosts_unavailable_hosts(monkeypatch: pytest.MonkeyPatch) -> None:
+ """Verify that requesting nodes with invalid names causes number of available
+ nodes check to fail due to valid # of named nodes being under num_nodes"""
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
+
+ # let's supply 2 invalid and 1 valid hostname
+ actual_hosts = list(dragon_backend._hosts)
+ actual_hosts[0] = f"x{actual_hosts[0]}"
+ actual_hosts[1] = f"x{actual_hosts[1]}"
+
+ host_list = ",".join(actual_hosts)
+
+ run_req = DragonRunRequest(
+ exe="sleep",
+ exe_args=["5"],
+ path="/a/fake/path",
+ nodes=2, # <----- requesting 2 of 3 available nodes
+ hostlist=host_list, # <--- only one valid name available
+ tasks=1,
+ tasks_per_node=1,
+ env={},
+ current_env={},
+ pmi_enabled=False,
+ policy=DragonRunPolicy(),
+ )
+
+ can_honor, error_msg = dragon_backend._can_honor(run_req)
+
+ # confirm the failure is indicated
+ assert not can_honor
+ # confirm failure message indicates number of nodes requested as cause
+ assert "named hosts" in error_msg
+
+
+def test_can_honor_hosts_unavailable_hosts_ok(monkeypatch: pytest.MonkeyPatch) -> None:
+ """Verify that requesting nodes with invalid names causes number of available
+ nodes check to be reduced but still passes if enough valid named nodes are passed"""
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
+
+ # let's supply 2 valid and 1 invalid hostname
+ actual_hosts = list(dragon_backend._hosts)
+ actual_hosts[0] = f"x{actual_hosts[0]}"
+
+ host_list = ",".join(actual_hosts)
+
+ run_req = DragonRunRequest(
+ exe="sleep",
+ exe_args=["5"],
+ path="/a/fake/path",
+ nodes=2, # <----- requesting 2 of 3 available nodes
+ hostlist=host_list, # <--- two valid names are available
+ tasks=1,
+ tasks_per_node=1,
+ env={},
+ current_env={},
+ pmi_enabled=False,
+ policy=DragonRunPolicy(),
+ )
+
+ can_honor, error_msg = dragon_backend._can_honor(run_req)
+
+ # confirm the request is honored
+ assert can_honor, error_msg
+ # confirm no error message is returned on success
+ assert error_msg is None, error_msg
+
+
+def test_can_honor_hosts_1_hosts_requested(monkeypatch: pytest.MonkeyPatch) -> None:
+ """Verify that requesting nodes with invalid names causes number of available
+ nodes check to be reduced but still passes if enough valid named nodes are passed"""
+ dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
+
+ # let's supply 2 valid and 1 invalid hostname
+ actual_hosts = list(dragon_backend._hosts)
+ actual_hosts[0] = f"x{actual_hosts[0]}"
+
+ host_list = ",".join(actual_hosts)
+
+ run_req = DragonRunRequest(
+ exe="sleep",
+ exe_args=["5"],
+ path="/a/fake/path",
+ nodes=1, # <----- requesting 1 of 3 available nodes
+ hostlist=host_list, # <--- two valid names are available
+ tasks=1,
+ tasks_per_node=1,
+ env={},
+ current_env={},
+ pmi_enabled=False,
+ policy=DragonRunPolicy(),
+ )
+
+ can_honor, error_msg = dragon_backend._can_honor(run_req)
- assert actual_msg == expected_message
+ # confirm the request is honored
+ assert can_honor, error_msg
diff --git a/tests/_legacy/test_dragon_run_request_nowlm.py b/tests/_legacy/test_dragon_run_request_nowlm.py
index 2b5526c69e..98f5b706da 100644
--- a/tests/_legacy/test_dragon_run_request_nowlm.py
+++ b/tests/_legacy/test_dragon_run_request_nowlm.py
@@ -101,5 +101,5 @@ def test_run_request_with_negative_affinity(
),
)
- assert f"{device}_affinity" in str(ex.value.args[0])
- assert "NumberNotGeError" in str(ex.value.args[0])
+ assert f"{device}_affinity" in str(ex.value)
+ assert "greater than or equal to 0" in str(ex.value)
diff --git a/tests/_legacy/test_dragon_step.py b/tests/_legacy/test_dragon_step.py
index 17279a33c6..3dbdf114ea 100644
--- a/tests/_legacy/test_dragon_step.py
+++ b/tests/_legacy/test_dragon_step.py
@@ -73,12 +73,18 @@ def dragon_batch_step(test_dir: str) -> DragonBatchStep:
cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]]
gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]]
+ # specify 3 hostnames to select from but require only 2 nodes
+ num_nodes = 2
+ hostnames = ["host1", "host2", "host3"]
+
# assign some unique affinities to each run setting instance
for index, rs in enumerate(settings):
if gpu_affinities[index]:
rs.set_node_feature("gpu")
rs.set_cpu_affinity(cpu_affinities[index])
rs.set_gpu_affinity(gpu_affinities[index])
+ rs.set_hostlist(hostnames)
+ rs.set_nodes(num_nodes)
steps = list(
DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings)
@@ -374,6 +380,11 @@ def test_dragon_batch_step_write_request_file(
cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]]
gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]]
+ hostnames = ["host1", "host2", "host3"]
+ num_nodes = 2
+
+ # parse requests file path from the launch command
+ # e.g. dragon python
launch_cmd = dragon_batch_step.get_launch_cmd()
requests_file = get_request_path_from_batch_script(launch_cmd)
@@ -392,3 +403,5 @@ def test_dragon_batch_step_write_request_file(
assert run_request
assert run_request.policy.cpu_affinity == cpu_affinities[index]
assert run_request.policy.gpu_affinity == gpu_affinities[index]
+ assert run_request.nodes == num_nodes
+ assert run_request.hostlist == ",".join(hostnames)
diff --git a/tests/_legacy/test_model.py b/tests/_legacy/test_model.py
index 5adf8070f1..f8a84deb8d 100644
--- a/tests/_legacy/test_model.py
+++ b/tests/_legacy/test_model.py
@@ -24,8 +24,10 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import typing as t
from uuid import uuid4
+import numpy as np
import pytest
from smartsim import Experiment
@@ -35,7 +37,10 @@
from smartsim.entity import Application
from smartsim.error import EntityExistsError, SSUnsupportedError
from smartsim.settings import RunSettings, SbatchSettings, SrunSettings
-from smartsim.settings.mpiSettings import _BaseMPISettings
+
+# from smartsim.settings.mpiSettings import
+
+_BaseMPISettings = t.Any
# The tests in this file belong to the slow_tests group
pytestmark = pytest.mark.slow_tests
diff --git a/tests/_legacy/test_preview.py b/tests/_legacy/test_preview.py
index 82d443fb3e..6f029aab8f 100644
--- a/tests/_legacy/test_preview.py
+++ b/tests/_legacy/test_preview.py
@@ -359,7 +359,7 @@ def test_model_preview_properties(test_dir, wlmutils):
assert hw_rs == hello_world_model.run_settings.exe_args[0]
assert None == hello_world_model.batch_settings
assert "port" in list(hello_world_model.params.items())[0]
- assert hw_port in list(hello_world_model.params.items())[0]
+ assert str(hw_port) in list(hello_world_model.params.items())[0]
assert "password" in list(hello_world_model.params.items())[1]
assert hw_password in list(hello_world_model.params.items())[1]
@@ -983,7 +983,7 @@ def test_preview_active_infrastructure_feature_store_error(
exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher)
monkeypatch.setattr(
- smartsim.database.orchestrator.FeatureStore, "is_active", lambda x: True
+ smartsim.database.feature_store.FeatureStore, "is_active", lambda x: True
)
orc = exp.create_feature_store(
diff --git a/tests/_legacy/test_smartredis.py b/tests/_legacy/test_smartredis.py
index f09cc8ca89..d4ac0ceebc 100644
--- a/tests/_legacy/test_smartredis.py
+++ b/tests/_legacy/test_smartredis.py
@@ -27,10 +27,7 @@
import pytest
-from smartsim import Experiment
-from smartsim._core.utils import installed_redisai_backends
from smartsim.builders import Ensemble
-from smartsim.database import FeatureStore
from smartsim.entity import Application
from smartsim.status import JobStatus
@@ -51,7 +48,9 @@
except ImportError:
shouldrun = False
-torch_available = "torch" in installed_redisai_backends()
+torch_available = (
+ "torch" in []
+) # todo: update test to replace installed_redisai_backends()
shouldrun &= torch_available
diff --git a/tests/backends/test_ml_init.py b/tests/backends/test_ml_init.py
index 445ee8c444..7f5c6f9864 100644
--- a/tests/backends/test_ml_init.py
+++ b/tests/backends/test_ml_init.py
@@ -28,7 +28,13 @@
import pytest
-pytestmark = [pytest.mark.group_a, pytest.mark.group_b, pytest.mark.slow_tests]
+try:
+ import tensorflow
+ import torch
+except ImportError:
+ pytestmark = pytest.mark.skip("tensorflow or torch were not available")
+else:
+ pytestmark = [pytest.mark.group_a, pytest.mark.group_b, pytest.mark.slow_tests]
def test_import_ss_ml(monkeypatch):
diff --git a/tests/dragon_wlm/__init__.py b/tests/dragon_wlm/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/dragon_wlm/channel.py b/tests/dragon_wlm/channel.py
new file mode 100644
index 0000000000..4c46359c2d
--- /dev/null
+++ b/tests/dragon_wlm/channel.py
@@ -0,0 +1,125 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import base64
+import pathlib
+import threading
+import typing as t
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class FileSystemCommChannel(CommChannelBase):
+ """Passes messages by writing to a file"""
+
+ def __init__(self, key: pathlib.Path) -> None:
+ """Initialize the FileSystemCommChannel instance.
+
+ :param key: a path to the root directory of the feature store
+ """
+ self._lock = threading.RLock()
+
+ super().__init__(key.as_posix())
+ self._file_path = key
+
+ if not self._file_path.parent.exists():
+ self._file_path.parent.mkdir(parents=True)
+
+ self._file_path.touch()
+
+ def send(self, value: bytes, timeout: float = 0) -> None:
+ """Send a message throuh the underlying communication channel.
+
+ :param value: The value to send
+ :param timeout: maximum time to wait (in seconds) for messages to send
+ """
+ with self._lock:
+ # write as text so we can add newlines as delimiters
+ with open(self._file_path, "a") as fp:
+ encoded_value = base64.b64encode(value).decode("utf-8")
+ fp.write(f"{encoded_value}\n")
+ logger.debug(f"FileSystemCommChannel {self._file_path} sent message")
+
+ def recv(self, timeout: float = 0) -> t.List[bytes]:
+ """Receives message(s) through the underlying communication channel.
+
+ :param timeout: maximum time to wait (in seconds) for messages to arrive
+ :returns: the received message
+ :raises SmartSimError: if the descriptor points to a missing file
+ """
+ with self._lock:
+ messages: t.List[bytes] = []
+ if not self._file_path.exists():
+ raise SmartSimError("Empty channel")
+
+ # read as text so we can split on newlines
+ with open(self._file_path, "r") as fp:
+ lines = fp.readlines()
+
+ if lines:
+ line = lines.pop(0)
+ event_bytes = base64.b64decode(line.encode("utf-8"))
+ messages.append(event_bytes)
+
+ self.clear()
+
+ # remove the first message only, write remainder back...
+ if len(lines) > 0:
+ with open(self._file_path, "w") as fp:
+ fp.writelines(lines)
+
+ logger.debug(
+ f"FileSystemCommChannel {self._file_path} received message"
+ )
+
+ return messages
+
+ def clear(self) -> None:
+ """Create an empty file for events."""
+ if self._file_path.exists():
+ self._file_path.unlink()
+ self._file_path.touch()
+
+ @classmethod
+ def from_descriptor(
+ cls,
+ descriptor: str,
+ ) -> "FileSystemCommChannel":
+ """A factory method that creates an instance from a descriptor string.
+
+ :param descriptor: The descriptor that uniquely identifies the resource
+ :returns: An attached FileSystemCommChannel
+ """
+ try:
+ path = pathlib.Path(descriptor)
+ return FileSystemCommChannel(path)
+ except Exception:
+ logger.warning(f"failed to create fs comm channel: {descriptor}")
+ raise
diff --git a/tests/dragon_wlm/conftest.py b/tests/dragon_wlm/conftest.py
new file mode 100644
index 0000000000..bdec40b7e5
--- /dev/null
+++ b/tests/dragon_wlm/conftest.py
@@ -0,0 +1,126 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import annotations
+
+import os
+import socket
+import typing as t
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+# isort: off
+import dragon.data.ddict.ddict as dragon_ddict
+import dragon.infrastructure.policy as dragon_policy
+import dragon.infrastructure.process_desc as dragon_process_desc
+import dragon.native.process as dragon_process
+
+from dragon.fli import FLInterface
+
+# isort: on
+
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.storage import dragon_util
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+@pytest.fixture(scope="module")
+def the_storage() -> dragon_ddict.DDict:
+ """Fixture to instantiate a dragon distributed dictionary."""
+ return dragon_util.create_ddict(1, 2, 32 * 1024**2)
+
+
+@pytest.fixture(scope="module")
+def the_worker_channel() -> DragonFLIChannel:
+ """Fixture to create a valid descriptor for a worker channel
+ that can be attached to."""
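+ # create a local channel and wrap it in an FLI so it can be shared via descriptor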
+ channel_ = create_local()
+ fli_ = FLInterface(main_ch=channel_, manager_ch=None)
+ comm_channel = DragonFLIChannel(fli_)
+ return comm_channel
+
+
+@pytest.fixture(scope="module")
+def the_backbone(
+ the_storage: t.Any, the_worker_channel: DragonFLIChannel
+) -> BackboneFeatureStore:
+ """Fixture to create a distributed dragon dictionary and wrap it
+ in a BackboneFeatureStore.
+
+ :param the_storage: The dragon storage engine to use
+ :param the_worker_channel: Pre-configured worker channel
+ """
+
+ backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True)
+ backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor
+
+ return backbone
+
+
+@pytest.fixture(scope="module")
+def backbone_descriptor(the_backbone: BackboneFeatureStore) -> str:
+ # create a shared backbone featurestore
+ return the_backbone.descriptor
+
+
+def function_as_dragon_proc(
+ entrypoint_fn: t.Callable[[t.Any], None],
+ args: t.List[t.Any],
+ cpu_affinity: t.List[int],
+ gpu_affinity: t.List[int],
+) -> dragon_process.Process:
+ """Execute a function as an independent dragon process.
+
+ :param entrypoint_fn: The function to execute
+ :param args: The arguments for the entrypoint function
+ :param cpu_affinity: The cpu affinity for the process
+ :param gpu_affinity: The gpu affinity for the process
+ :returns: The dragon process handle
+ """
+ options = dragon_process_desc.ProcessOptions(make_inf_channels=True)
+ local_policy = dragon_policy.Policy(
+ placement=dragon_policy.Policy.Placement.HOST_NAME,
+ host_name=socket.gethostname(),
+ cpu_affinity=cpu_affinity,
+ gpu_affinity=gpu_affinity,
+ )
+ return dragon_process.Process(
+ target=entrypoint_fn,
+ args=args,
+ cwd=os.getcwd(),
+ policy=local_policy,
+ options=options,
+ stderr=dragon_process.Popen.STDOUT,
+ stdout=dragon_process.Popen.STDOUT,
+ )
diff --git a/tests/dragon_wlm/feature_store.py b/tests/dragon_wlm/feature_store.py
new file mode 100644
index 0000000000..d06b0b334e
--- /dev/null
+++ b/tests/dragon_wlm/feature_store.py
@@ -0,0 +1,152 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pathlib
+import typing as t
+
+import smartsim.error as sse
+from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class MemoryFeatureStore(FeatureStore):
+ """A feature store with values persisted only in local memory"""
+
+ def __init__(
+ self, storage: t.Optional[t.Dict[str, t.Union[str, bytes]]] = None
+ ) -> None:
+ """Initialize the MemoryFeatureStore instance"""
+ super().__init__("in-memory-fs")
+ if storage is None:
+ storage = {"_": "abc"}
+ self._storage = storage
+
+ def _get(self, key: str) -> t.Union[str, bytes]:
+ """Retrieve a value from the underlying storage mechanism
+
+ :param key: The unique key that identifies the resource
+ :returns: the value identified by the key
+ :raises KeyError: if the key has not been used to store a value"""
+ return self._storage[key]
+
+ def _set(self, key: str, value: t.Union[str, bytes]) -> None:
+ """Store a value into the underlying storage mechanism
+
+ :param key: The unique key that identifies the resource
+ :param value: The value to store
+ :returns: the value identified by the key
+ :raises KeyError: if the key has not been used to store a value"""
+ self._storage[key] = value
+
+ def _contains(self, key: str) -> bool:
+ """Determine if the storage mechanism contains a given key
+
+ :param key: The unique key that identifies the resource
+ :returns: True if the key is defined, False otherwise"""
+ return key in self._storage
+
+
+class FileSystemFeatureStore(FeatureStore):
+ """Alternative feature store implementation for testing. Stores all
+ data on the file system"""
+
+ def __init__(self, storage_dir: t.Union[pathlib.Path, str]) -> None:
+ """Initialize the FileSystemFeatureStore instance
+
+ :param storage_dir: (optional) root directory to store all data relative to"""
+ if isinstance(storage_dir, str):
+ storage_dir = pathlib.Path(storage_dir)
+ self._storage_dir = storage_dir
+ super().__init__(storage_dir.as_posix())
+
+ def _get(self, key: str) -> t.Union[str, bytes]:
+ """Retrieve a value from the underlying storage mechanism
+
+ :param key: The unique key that identifies the resource
+ :returns: the value identified by the key
+ :raises KeyError: if the key has not been used to store a value"""
+ path = self._key_path(key)
+ if not path.exists():
+ raise sse.SmartSimError(f"{path} not found in feature store")
+ return path.read_bytes()
+
+ def _set(self, key: str, value: t.Union[str, bytes]) -> None:
+ """Store a value into the underlying storage mechanism
+
+ :param key: The unique key that identifies the resource
+ :param value: The value to store
+ :returns: the value identified by the key
+ :raises KeyError: if the key has not been used to store a value"""
+ path = self._key_path(key, create=True)
+ if isinstance(value, str):
+ value = value.encode("utf-8")
+ path.write_bytes(value)
+
+ def _contains(self, key: str) -> bool:
+ """Determine if the storage mechanism contains a given key
+
+ :param key: The unique key that identifies the resource
+ :returns: True if the key is defined, False otherwise"""
+ path = self._key_path(key)
+ return path.exists()
+
+ def _key_path(self, key: str, create: bool = False) -> pathlib.Path:
+ """Given a key, return a path that is optionally combined with a base
+ directory used by the FileSystemFeatureStore.
+
+ :param key: Unique key of an item to retrieve from the feature store"""
+ value = pathlib.Path(key)
+
+ if self._storage_dir is not None:
+ value = self._storage_dir / key
+
+ if create:
+ value.parent.mkdir(parents=True, exist_ok=True)
+
+ return value
+
+ @classmethod
+ def from_descriptor(
+ cls,
+ descriptor: str,
+ ) -> "FileSystemFeatureStore":
+ """A factory method that creates an instance from a descriptor string
+
+ :param descriptor: The descriptor that uniquely identifies the resource
+ :returns: An attached FileSystemFeatureStore"""
+ try:
+ path = pathlib.Path(descriptor)
+ path.mkdir(parents=True, exist_ok=True)
+ if not path.is_dir():
+ raise ValueError("FileSystemFeatureStore requires a directory path")
+ if not path.exists():
+ path.mkdir(parents=True, exist_ok=True)
+ return FileSystemFeatureStore(path)
+ except Exception:
+ logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}")
+ raise
diff --git a/tests/dragon_wlm/test_core_machine_learning_worker.py b/tests/dragon_wlm/test_core_machine_learning_worker.py
new file mode 100644
index 0000000000..f9295d9e86
--- /dev/null
+++ b/tests/dragon_wlm/test_core_machine_learning_worker.py
@@ -0,0 +1,377 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pathlib
+import time
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+import torch
+
+import smartsim.error as sse
+from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey, TensorKey
+from smartsim._core.mli.infrastructure.worker.worker import (
+ InferenceRequest,
+ MachineLearningWorkerCore,
+ RequestBatch,
+ TransformOutputResult,
+)
+
+from .feature_store import FileSystemFeatureStore, MemoryFeatureStore
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+# retrieved from pytest fixtures
+is_dragon = (
+ pytest.test_launcher == "dragon" if hasattr(pytest, "test_launcher") else False
+)
+torch_available = (
+ "torch" in []
+) # todo: update test to replace installed_redisai_backends()
+
+
+@pytest.fixture
+def persist_torch_model(test_dir: str) -> pathlib.Path:
+ ts_start = time.time_ns()
+ print("Starting model file creation...")
+ test_path = pathlib.Path(test_dir)
+ model_path = test_path / "basic.pt"
+
+ model = torch.nn.Linear(2, 1)
+ torch.save(model, model_path)
+ ts_end = time.time_ns()
+
+ ts_elapsed = (ts_end - ts_start) / 1000000000
+ print(f"Model file creation took {ts_elapsed} seconds")
+ return model_path
+
+
+@pytest.fixture
+def persist_torch_tensor(test_dir: str) -> pathlib.Path:
+ ts_start = time.time_ns()
+ print("Starting model file creation...")
+ test_path = pathlib.Path(test_dir)
+ file_path = test_path / "tensor.pt"
+
+ tensor = torch.randn((100, 100, 2))
+ torch.save(tensor, file_path)
+ ts_end = time.time_ns()
+
+ ts_elapsed = (ts_end - ts_start) / 1000000000
+ print(f"Tensor file creation took {ts_elapsed} seconds")
+ return file_path
+
+
+@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> None:
+ """Verify that the ML worker successfully retrieves a model
+ when given a valid (file system) key"""
+ worker = MachineLearningWorkerCore
+ key = str(persist_torch_model)
+ feature_store = FileSystemFeatureStore(test_dir)
+ fsd = feature_store.descriptor
+ feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes()
+
+ model_key = ModelKey(key=key, descriptor=fsd)
+ request = InferenceRequest(model_key=model_key)
+ batch = RequestBatch([request], None, model_key)
+
+ fetch_result = worker.fetch_model(batch, {fsd: feature_store})
+ assert fetch_result.model_bytes
+ assert fetch_result.model_bytes == persist_torch_model.read_bytes()
+
+
+def test_fetch_model_disk_missing() -> None:
+ """Verify that the ML worker fails to retrieves a model
+ when given an invalid (file system) key"""
+ worker = MachineLearningWorkerCore
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+
+ key = "/path/that/doesnt/exist"
+
+ model_key = ModelKey(key=key, descriptor=fsd)
+ request = InferenceRequest(model_key=model_key)
+ batch = RequestBatch([request], None, model_key)
+
+ with pytest.raises(sse.SmartSimError) as ex:
+ worker.fetch_model(batch, {fsd: feature_store})
+
+ # ensure the error message includes key-identifying information
+ assert key in ex.value.args[0]
+
+
+@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None:
+ """Verify that the ML worker successfully retrieves a model
+    when given a valid (feature store) key"""
+ worker = MachineLearningWorkerCore
+
+ # create a key to retrieve from the feature store
+ key = "test-model"
+
+ # put model bytes into the feature store
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+ feature_store[key] = persist_torch_model.read_bytes()
+
+ model_key = ModelKey(key=key, descriptor=feature_store.descriptor)
+ request = InferenceRequest(model_key=model_key)
+ batch = RequestBatch([request], None, model_key)
+
+ fetch_result = worker.fetch_model(batch, {fsd: feature_store})
+ assert fetch_result.model_bytes
+ assert fetch_result.model_bytes == persist_torch_model.read_bytes()
+
+
+def test_fetch_model_feature_store_missing() -> None:
+ """Verify that the ML worker fails to retrieves a model
+ when given an invalid (feature store) key"""
+ worker = MachineLearningWorkerCore
+
+ key = "some-key"
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+
+ model_key = ModelKey(key=key, descriptor=feature_store.descriptor)
+ request = InferenceRequest(model_key=model_key)
+ batch = RequestBatch([request], None, model_key)
+
+ # todo: consider that raising this exception shows impl. replace...
+ with pytest.raises(sse.SmartSimError) as ex:
+ worker.fetch_model(batch, {fsd: feature_store})
+
+ # ensure the error message includes key-identifying information
+ assert key in ex.value.args[0]
+
+
+@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None:
+ """Verify that the ML worker successfully retrieves a model
+    when given a valid (in-memory) key"""
+ worker = MachineLearningWorkerCore
+
+ key = "test-model"
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+ feature_store[key] = persist_torch_model.read_bytes()
+
+ model_key = ModelKey(key=key, descriptor=feature_store.descriptor)
+ request = InferenceRequest(model_key=model_key)
+ batch = RequestBatch([request], None, model_key)
+
+ fetch_result = worker.fetch_model(batch, {fsd: feature_store})
+ assert fetch_result.model_bytes
+ assert fetch_result.model_bytes == persist_torch_model.read_bytes()
+
+
+@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None:
+ """Verify that the ML worker successfully retrieves a tensor/input
+ when given a valid (file system) key"""
+ tensor_name = str(persist_torch_tensor)
+
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+ request = InferenceRequest(input_keys=[TensorKey(key=tensor_name, descriptor=fsd)])
+
+ model_key = ModelKey(key="test-model", descriptor=fsd)
+ batch = RequestBatch([request], None, model_key)
+
+ worker = MachineLearningWorkerCore
+
+ feature_store[tensor_name] = persist_torch_tensor.read_bytes()
+
+ fetch_result = worker.fetch_inputs(batch, {fsd: feature_store})
+ assert fetch_result[0].inputs is not None
+
+
+def test_fetch_input_disk_missing() -> None:
+ """Verify that the ML worker fails to retrieves a tensor/input
+ when given an invalid (file system) key"""
+ worker = MachineLearningWorkerCore
+
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+ key = "/path/that/doesnt/exist"
+
+ request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)])
+
+ model_key = ModelKey(key="test-model", descriptor=fsd)
+ batch = RequestBatch([request], None, model_key)
+
+ with pytest.raises(sse.SmartSimError) as ex:
+ worker.fetch_inputs(batch, {fsd: feature_store})
+
+ # ensure the error message includes key-identifying information
+    assert key in ex.value.args[0]
+
+
+@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None:
+ """Verify that the ML worker successfully retrieves a tensor/input
+ when given a valid (feature store) key"""
+ worker = MachineLearningWorkerCore
+
+ tensor_name = "test-tensor"
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+
+ request = InferenceRequest(input_keys=[TensorKey(key=tensor_name, descriptor=fsd)])
+
+ # put model bytes into the feature store
+ feature_store[tensor_name] = persist_torch_tensor.read_bytes()
+
+ model_key = ModelKey(key="test-model", descriptor=fsd)
+ batch = RequestBatch([request], None, model_key)
+
+ fetch_result = worker.fetch_inputs(batch, {fsd: feature_store})
+ assert fetch_result[0].inputs
+ assert (
+ list(fetch_result[0].inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10]
+ )
+
+
+@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> None:
+ """Verify that the ML worker successfully retrieves multiple tensor/input
+ when given a valid collection of (feature store) keys"""
+ worker = MachineLearningWorkerCore
+
+ tensor_name = "test-tensor"
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+
+ # put model bytes into the feature store
+ body1 = persist_torch_tensor.read_bytes()
+ feature_store[tensor_name + "1"] = body1
+
+ body2 = b"abcdefghijklmnopqrstuvwxyz"
+ feature_store[tensor_name + "2"] = body2
+
+ body3 = b"mnopqrstuvwxyzabcdefghijkl"
+ feature_store[tensor_name + "3"] = body3
+
+ request = InferenceRequest(
+ input_keys=[
+ TensorKey(key=tensor_name + "1", descriptor=fsd),
+ TensorKey(key=tensor_name + "2", descriptor=fsd),
+ TensorKey(key=tensor_name + "3", descriptor=fsd),
+ ]
+ )
+
+ model_key = ModelKey(key="test-model", descriptor=fsd)
+ batch = RequestBatch([request], None, model_key)
+
+ fetch_result = worker.fetch_inputs(batch, {fsd: feature_store})
+
+ raw_bytes = list(fetch_result[0].inputs)
+ assert raw_bytes
+ assert raw_bytes[0][:10] == persist_torch_tensor.read_bytes()[:10]
+ assert raw_bytes[1][:10] == body2[:10]
+ assert raw_bytes[2][:10] == body3[:10]
+
+
+def test_fetch_input_feature_store_missing() -> None:
+ """Verify that the ML worker fails to retrieves a tensor/input
+ when given an invalid (feature store) key"""
+ worker = MachineLearningWorkerCore
+
+ key = "bad-key"
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+ request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)])
+
+ model_key = ModelKey(key="test-model", descriptor=fsd)
+ batch = RequestBatch([request], None, model_key)
+
+ with pytest.raises(sse.SmartSimError) as ex:
+ worker.fetch_inputs(batch, {fsd: feature_store})
+
+ # ensure the error message includes key-identifying information
+ assert key in ex.value.args[0]
+
+
+@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None:
+ """Verify that the ML worker successfully retrieves a tensor/input
+    when given a valid (in-memory) key"""
+ worker = MachineLearningWorkerCore
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+
+ key = "test-model"
+ feature_store[key] = persist_torch_tensor.read_bytes()
+ request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)])
+
+ model_key = ModelKey(key="test-model", descriptor=fsd)
+ batch = RequestBatch([request], None, model_key)
+
+ fetch_result = worker.fetch_inputs(batch, {fsd: feature_store})
+ assert fetch_result[0].inputs is not None
+
+
+def test_place_outputs() -> None:
+ """Verify outputs are shared using the feature store"""
+ worker = MachineLearningWorkerCore
+
+ key_name = "test-model"
+ feature_store = MemoryFeatureStore()
+ fsd = feature_store.descriptor
+
+ # create a key to retrieve from the feature store
+ keys = [
+ TensorKey(key=key_name + "1", descriptor=fsd),
+ TensorKey(key=key_name + "2", descriptor=fsd),
+ TensorKey(key=key_name + "3", descriptor=fsd),
+ ]
+ data = [b"abcdef", b"ghijkl", b"mnopqr"]
+
+ for fsk, v in zip(keys, data):
+ feature_store[fsk.key] = v
+
+ request = InferenceRequest(output_keys=keys)
+ transform_result = TransformOutputResult(data, [1], "c", "float32")
+
+ worker.place_output(request, transform_result, {fsd: feature_store})
+
+ for i in range(3):
+ assert feature_store[keys[i].key] == data[i]
+
+
+@pytest.mark.parametrize(
+ "key, descriptor",
+ [
+ pytest.param("", "desc", id="invalid key"),
+ pytest.param("key", "", id="invalid descriptor"),
+ ],
+)
+def test_invalid_tensorkey(key: str, descriptor: str) -> None:
+    """Verify that TensorKey rejects an empty key or descriptor."""
+    with pytest.raises(ValueError):
+        TensorKey(key, descriptor)
diff --git a/tests/dragon_wlm/test_device_manager.py b/tests/dragon_wlm/test_device_manager.py
new file mode 100644
index 0000000000..d270e921cb
--- /dev/null
+++ b/tests/dragon_wlm/test_device_manager.py
@@ -0,0 +1,186 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.infrastructure.control.device_manager import (
+ DeviceManager,
+ WorkerDevice,
+)
+from smartsim._core.mli.infrastructure.storage.feature_store import (
+ FeatureStore,
+ ModelKey,
+ TensorKey,
+)
+from smartsim._core.mli.infrastructure.worker.worker import (
+ ExecuteResult,
+ FetchInputResult,
+ FetchModelResult,
+ InferenceRequest,
+ LoadModelResult,
+ MachineLearningWorkerBase,
+ RequestBatch,
+ TransformInputResult,
+ TransformOutputResult,
+)
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+class MockWorker(MachineLearningWorkerBase):
+ @staticmethod
+ def fetch_model(
+ batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore]
+ ) -> FetchModelResult:
+ if batch.has_raw_model:
+ return FetchModelResult(batch.raw_model)
+ return FetchModelResult(b"fetched_model")
+
+ @staticmethod
+ def load_model(
+ batch: RequestBatch, fetch_result: FetchModelResult, device: str
+ ) -> LoadModelResult:
+ return LoadModelResult(fetch_result.model_bytes)
+
+ @staticmethod
+ def transform_input(
+ batch: RequestBatch,
+ fetch_results: list[FetchInputResult],
+ mem_pool: "MemoryPool",
+ ) -> TransformInputResult:
+ return TransformInputResult(b"result", [slice(0, 1)], [[1, 2]], ["float32"])
+
+ @staticmethod
+ def execute(
+ batch: RequestBatch,
+ load_result: LoadModelResult,
+ transform_result: TransformInputResult,
+ device: str,
+ ) -> ExecuteResult:
+ return ExecuteResult(b"result", [slice(0, 1)])
+
+ @staticmethod
+ def transform_output(
+ batch: RequestBatch, execute_result: ExecuteResult
+ ) -> t.List[TransformOutputResult]:
+ return [TransformOutputResult(b"result", None, "c", "float32")]
+
+
+def test_worker_device():
+ worker_device = WorkerDevice("gpu:0")
+ assert worker_device.name == "gpu:0"
+
+ model_key = "my_model_key"
+ model = b"the model"
+
+ worker_device.add_model(model_key, model)
+
+ assert model_key in worker_device
+ assert worker_device.get_model(model_key) == model
+ worker_device.remove_model(model_key)
+
+ assert model_key not in worker_device
+
+
+def test_device_manager_model_in_request():
+
+ worker_device = WorkerDevice("gpu:0")
+ device_manager = DeviceManager(worker_device)
+
+ worker = MockWorker()
+
+ tensor_key = TensorKey(key="key", descriptor="desc")
+ output_key = TensorKey(key="key", descriptor="desc")
+ model_key = ModelKey(key="model key", descriptor="desc")
+
+ request = InferenceRequest(
+ model_key=model_key,
+ callback=None,
+ raw_inputs=None,
+ input_keys=[tensor_key],
+ input_meta=None,
+ output_keys=[output_key],
+ raw_model=b"raw model",
+ batch_size=0,
+ )
+
+ request_batch = RequestBatch(
+ [request],
+ TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]),
+ model_id=model_key,
+ )
+
+ with device_manager.get_device(
+ worker=worker, batch=request_batch, feature_stores={}
+ ) as returned_device:
+
+ assert returned_device == worker_device
+ assert worker_device.get_model(model_key.key) == b"raw model"
+
+ assert model_key.key not in worker_device
+
+
+def test_device_manager_model_key():
+
+ worker_device = WorkerDevice("gpu:0")
+ device_manager = DeviceManager(worker_device)
+
+ worker = MockWorker()
+
+ tensor_key = TensorKey(key="key", descriptor="desc")
+ output_key = TensorKey(key="key", descriptor="desc")
+ model_key = ModelKey(key="model key", descriptor="desc")
+
+ request = InferenceRequest(
+ model_key=model_key,
+ callback=None,
+ raw_inputs=None,
+ input_keys=[tensor_key],
+ input_meta=None,
+ output_keys=[output_key],
+ raw_model=None,
+ batch_size=0,
+ )
+
+ request_batch = RequestBatch(
+ [request],
+ TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]),
+ model_id=model_key,
+ )
+
+ with device_manager.get_device(
+ worker=worker, batch=request_batch, feature_stores={}
+ ) as returned_device:
+
+ assert returned_device == worker_device
+ assert worker_device.get_model(model_key.key) == b"fetched_model"
+
+ assert model_key.key in worker_device
diff --git a/tests/dragon_wlm/test_dragon_backend.py b/tests/dragon_wlm/test_dragon_backend.py
new file mode 100644
index 0000000000..dc98f5de75
--- /dev/null
+++ b/tests/dragon_wlm/test_dragon_backend.py
@@ -0,0 +1,308 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import time
+import uuid
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+
+from smartsim._core.launcher.dragon.dragon_backend import DragonBackend
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.infrastructure.comm.event import (
+ OnCreateConsumer,
+ OnShutdownRequested,
+)
+from smartsim._core.mli.infrastructure.control.listener import (
+ ConsumerRegistrationListener,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim.log import get_logger
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+logger = get_logger(__name__)
+
+
+@pytest.fixture(scope="module")
+def the_backend() -> DragonBackend:
+ return DragonBackend(pid=9999)
+
+
+@pytest.mark.skip("Test is unreliable on build agent and may hang. TODO: Fix")
+def test_dragonbackend_start_listener(the_backend: DragonBackend):
+ """Verify the background process listening to consumer registration events
+ is up and processing messages as expected."""
+
+ # We need to let the backend create the backbone to continue
+ backbone = the_backend._create_backbone()
+ backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS)
+ backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER)
+
+ os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor
+
+ with pytest.raises(KeyError) as ex:
+ # we expect the value of the consumer to be empty until
+ # the listener start-up completes.
+ backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER]
+
+ assert "not found" in ex.value.args[0]
+
+ drg_process = the_backend.start_event_listener(cpu_affinity=[], gpu_affinity=[])
+
+    # confirm there is a process still running
+ logger.info(f"Dragon process started: {drg_process}")
+ assert drg_process is not None, "Backend was unable to start event listener"
+ assert drg_process.puid != 0, "Process unique ID is empty"
+ assert drg_process.returncode is None, "Listener terminated early"
+
+ # wait for the event listener to come up
+ try:
+ config = backbone.wait_for(
+ [BackboneFeatureStore.MLI_REGISTRAR_CONSUMER], timeout=30
+ )
+ # verify result was in the returned configuration map
+ assert config[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER]
+ except Exception:
+ raise KeyError(
+ f"Unable to locate {BackboneFeatureStore.MLI_REGISTRAR_CONSUMER}"
+ "in the backbone"
+ )
+
+ # wait_for ensures the normal retrieval will now work, error-free
+ descriptor = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER]
+ assert descriptor is not None
+
+ # register a new listener channel
+ comm_channel = DragonCommChannel.from_descriptor(descriptor)
+ mock_descriptor = str(uuid.uuid4())
+ event = OnCreateConsumer("test_dragonbackend_start_listener", mock_descriptor, [])
+
+ event_bytes = bytes(event)
+ comm_channel.send(event_bytes)
+
+ subscriber_list = []
+
+ # Give the channel time to write the message and the listener time to handle it
+ for i in range(20):
+ time.sleep(1)
+ # Retrieve the subscriber list from the backbone and verify it is updated
+ if subscriber_list := backbone.notification_channels:
+ logger.debug(f"The subscriber list was populated after {i} iterations")
+ break
+
+ assert mock_descriptor in subscriber_list
+
+    # check whether the listener exited on its own
+    return_code = drg_process.returncode
+
+    # clean up if the listener process is still running
+ if return_code is None and drg_process.is_alive:
+ drg_process.kill()
+ drg_process.join()
+
+
+def test_dragonbackend_backend_consumer(the_backend: DragonBackend):
+ """Verify the listener background process updates the appropriate
+ value in the backbone."""
+
+ # We need to let the backend create the backbone to continue
+ backbone = the_backend._create_backbone()
+ backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS)
+ backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER)
+
+ assert backbone._allow_reserved_writes
+
+ # create listener with `as_service=False` to perform a single loop iteration
+ listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=False)
+
+ logger.debug(f"backbone loaded? {listener._backbone}")
+ logger.debug(f"listener created? {listener}")
+
+ try:
+ # call the service execute method directly to trigger
+ # the entire service lifecycle
+ listener.execute()
+
+ consumer_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER]
+ logger.debug(f"MLI_REGISTRAR_CONSUMER: {consumer_desc}")
+
+ assert consumer_desc
+    except Exception as ex:
+        logger.exception(f"Unexpected error while verifying the registrar consumer: {ex}")
+        raise
+ finally:
+ listener._on_shutdown()
+
+
+def test_dragonbackend_event_handled(the_backend: DragonBackend):
+ """Verify the event listener process updates the appropriate
+ value in the backbone when an event is received and again on shutdown.
+ """
+ # We need to let the backend create the backbone to continue
+ backbone = the_backend._create_backbone()
+ backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS)
+ backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER)
+
+ # create the listener to be tested
+ listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=False)
+
+ assert listener._backbone, "The listener is not attached to a backbone"
+
+ try:
+ # set up the listener but don't let the service event loop start
+ listener._create_eventing() # listener.execute()
+
+ # grab the channel descriptor so we can simulate registrations
+ channel_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER]
+ comm_channel = DragonCommChannel.from_descriptor(channel_desc)
+
+ num_events = 5
+ events = []
+ for i in range(num_events):
+ # register some mock consumers using the backend channel
+ event = OnCreateConsumer(
+ "test_dragonbackend_event_handled",
+ f"mock-consumer-descriptor-{uuid.uuid4()}",
+ [],
+ )
+ event_bytes = bytes(event)
+ comm_channel.send(event_bytes)
+ events.append(event)
+
+        # run a few iterations of the event loop in case it takes a few cycles to write
+        for i in range(20):
+            listener._on_iteration()
+            # Grab the value that should be getting updated
+            notify_consumers = set(backbone.notification_channels)
+            if len(notify_consumers) == len(events):
+                logger.info(f"Retrieved all consumers after {i} listen cycles")
+ break
+
+ # ... and confirm that all the mock consumer descriptors are registered
+ assert set([e.descriptor for e in events]) == set(notify_consumers)
+ logger.info(f"Number of registered consumers: {len(notify_consumers)}")
+
+ except Exception as ex:
+ logger.exception(f"test_dragonbackend_event_handled - exception occurred: {ex}")
+ assert False
+ finally:
+ # shutdown should unregister a registration listener
+ listener._on_shutdown()
+
+ for i in range(10):
+ if BackboneFeatureStore.MLI_REGISTRAR_CONSUMER not in backbone:
+ logger.debug(f"The listener was removed after {i} iterations")
+ channel_desc = None
+ break
+
+ # we should see that there is no listener registered
+ assert not channel_desc, "Listener shutdown failed to clean up the backbone"
+
+
+def test_dragonbackend_shutdown_event(the_backend: DragonBackend):
+ """Verify the background process shuts down when it receives a
+ shutdown request."""
+
+ # We need to let the backend create the backbone to continue
+ backbone = the_backend._create_backbone()
+ backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS)
+ backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER)
+
+ listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=True)
+
+ # set up the listener but don't let the listener loop start
+ listener._create_eventing() # listener.execute()
+
+ # grab the channel descriptor so we can publish to it
+ channel_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER]
+ comm_channel = DragonCommChannel.from_descriptor(channel_desc)
+
+ assert listener._consumer.listening, "Listener isn't ready to listen"
+
+ # send a shutdown request...
+ event = OnShutdownRequested("test_dragonbackend_shutdown_event")
+ event_bytes = bytes(event)
+ comm_channel.send(event_bytes, 0.1)
+
+ # execute should encounter the shutdown and exit
+ listener.execute()
+
+ # ...and confirm the listener is now cancelled
+ assert not listener._consumer.listening
+
+
+@pytest.mark.parametrize("health_check_frequency", [10, 20])
+def test_dragonbackend_shutdown_on_health_check(
+ the_backend: DragonBackend,
+ health_check_frequency: float,
+):
+ """Verify that the event listener automatically shuts down when
+ a new listener is registered in its place.
+
+ :param health_check_frequency: The expected frequency of service health check
+ invocations"""
+
+ # We need to let the backend create the backbone to continue
+ backbone = the_backend._create_backbone()
+ backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS)
+ backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER)
+
+ listener = ConsumerRegistrationListener(
+ backbone,
+ 1.0,
+ 1.0,
+ as_service=True, # allow service to run long enough to health check
+ health_check_frequency=health_check_frequency,
+ )
+
+ # set up the listener but don't let the listener loop start
+ listener._create_eventing() # listener.execute()
+ assert listener._consumer.listening, "Listener wasn't ready to listen"
+
+ # Replace the consumer descriptor in the backbone to trigger
+ # an automatic shutdown
+ backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = str(uuid.uuid4())
+
+ # set the last health check manually to verify the duration
+ start_at = time.time()
+ listener._last_health_check = time.time()
+
+ # run execute to let the service trigger health checks
+ listener.execute()
+ elapsed = time.time() - start_at
+
+ # confirm the frequency of the health check was honored
+ assert elapsed >= health_check_frequency
+
+ # ...and confirm the listener is now cancelled
+ assert (
+ not listener._consumer.listening
+ ), "Listener was not automatically shutdown by the health check"
diff --git a/tests/dragon_wlm/test_dragon_comm_utils.py b/tests/dragon_wlm/test_dragon_comm_utils.py
new file mode 100644
index 0000000000..a6f9c206a4
--- /dev/null
+++ b/tests/dragon_wlm/test_dragon_comm_utils.py
@@ -0,0 +1,257 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import base64
+import pathlib
+import uuid
+
+import pytest
+
+from smartsim.error.errors import SmartSimError
+
+dragon = pytest.importorskip("dragon")
+
+# isort: off
+import dragon.channels as dch
+import dragon.infrastructure.parameters as dp
+import dragon.managed_memory as dm
+import dragon.fli as fli
+
+# isort: on
+
+from smartsim._core.mli.comm.channel import dragon_util
+from smartsim.log import get_logger
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+logger = get_logger(__name__)
+
+
+@pytest.fixture(scope="function")
+def the_pool() -> dm.MemoryPool:
+ """Creates a memory pool."""
+ raw_pool_descriptor = dp.this_process.default_pd
+ descriptor_ = base64.b64decode(raw_pool_descriptor)
+
+ pool = dm.MemoryPool.attach(descriptor_)
+ return pool
+
+
+@pytest.fixture(scope="function")
+def the_channel() -> dch.Channel:
+ """Creates a Channel attached to the local memory pool."""
+ channel = dch.Channel.make_process_local()
+ return channel
+
+
+@pytest.fixture(scope="function")
+def the_fli(the_channel) -> fli.FLInterface:
+ """Creates an FLI attached to the local memory pool."""
+ fli_ = fli.FLInterface(main_ch=the_channel, manager_ch=None)
+ return fli_
+
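+# The round trip exercised throughout this module: serialize a live object to a
+# descriptor string, then re-attach from that string. A minimal sketch using only the
+# helpers imported above (mirrors the happy-path tests below):
+#
+#     descriptor = dragon_util.channel_to_descriptor(channel)
+#     reattached = dragon_util.descriptor_to_channel(descriptor)
+#     assert dragon_util.channel_to_descriptor(reattached) == descriptor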
+
+def test_descriptor_to_channel_empty() -> None:
+ """Verify that `descriptor_to_channel` raises an exception when
+ provided with an empty descriptor."""
+ descriptor = ""
+
+ with pytest.raises(ValueError) as ex:
+ dragon_util.descriptor_to_channel(descriptor)
+
+ assert "empty" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "descriptor",
+ ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()],
+)
+def test_descriptor_to_channel_b64fail(descriptor: str) -> None:
+ """Verify that `descriptor_to_channel` raises an exception when
+ provided with an incorrectly encoded descriptor.
+
+ :param descriptor: A descriptor that is not properly base64 encoded
+ """
+
+ with pytest.raises(ValueError) as ex:
+ dragon_util.descriptor_to_channel(descriptor)
+
+ assert "base64" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "descriptor",
+ [str(uuid.uuid4())],
+)
+def test_descriptor_to_channel_channel_fail(descriptor: str) -> None:
+ """Verify that `descriptor_to_channel` raises an exception when a correctly
+ formatted descriptor that does not describe a real channel is passed.
+
+    :param descriptor: A well-formed descriptor that does not reference an existing channel
+ """
+
+ with pytest.raises(SmartSimError) as ex:
+ dragon_util.descriptor_to_channel(descriptor)
+
+ # ensure we're receiving the right exception
+ assert "address" in ex.value.args[0]
+ assert "channel" in ex.value.args[0]
+
+
+def test_descriptor_to_channel_channel_not_available(the_channel: dch.Channel) -> None:
+ """Verify that `descriptor_to_channel` raises an exception when a channel
+ is no longer available.
+
+ :param the_channel: A dragon channel
+ """
+
+ # get a good descriptor & wipe out the channel so it can't be attached
+ descriptor = dragon_util.channel_to_descriptor(the_channel)
+ the_channel.destroy()
+
+ with pytest.raises(SmartSimError) as ex:
+ dragon_util.descriptor_to_channel(descriptor)
+
+ assert "address" in ex.value.args[0]
+
+
+def test_descriptor_to_channel_happy_path(the_channel: dch.Channel) -> None:
+ """Verify that `descriptor_to_channel` works as expected when provided
+ a valid descriptor
+
+ :param the_channel: A dragon channel
+ """
+
+ # get a good descriptor
+ descriptor = dragon_util.channel_to_descriptor(the_channel)
+
+ reattached = dragon_util.descriptor_to_channel(descriptor)
+ assert reattached
+
+ # and just make sure creation of the descriptor is transitive
+ assert dragon_util.channel_to_descriptor(reattached) == descriptor
+
+
+def test_descriptor_to_fli_empty() -> None:
+ """Verify that `descriptor_to_fli` raises an exception when
+ provided with an empty descriptor."""
+ descriptor = ""
+
+ with pytest.raises(ValueError) as ex:
+ dragon_util.descriptor_to_fli(descriptor)
+
+ assert "empty" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "descriptor",
+ ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()],
+)
+def test_descriptor_to_fli_b64fail(descriptor: str) -> None:
+ """Verify that `descriptor_to_fli` raises an exception when
+ provided with an incorrectly encoded descriptor.
+
+ :param descriptor: A descriptor that is not properly base64 encoded
+ """
+
+ with pytest.raises(ValueError) as ex:
+ dragon_util.descriptor_to_fli(descriptor)
+
+ assert "base64" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "descriptor",
+ [str(uuid.uuid4())],
+)
+def test_descriptor_to_fli_fli_fail(descriptor: str) -> None:
+ """Verify that `descriptor_to_fli` raises an exception when a correctly
+ formatted descriptor that does not describe a real FLI is passed.
+
+    :param descriptor: A well-formed descriptor that does not reference an existing FLI
+ """
+
+ with pytest.raises(SmartSimError) as ex:
+ dragon_util.descriptor_to_fli(descriptor)
+
+ # ensure we're receiving the right exception
+ assert "address" in ex.value.args[0]
+ assert "fli" in ex.value.args[0].lower()
+
+
+def test_descriptor_to_fli_fli_not_available(
+ the_fli: fli.FLInterface, the_channel: dch.Channel
+) -> None:
+ """Verify that `descriptor_to_fli` raises an exception when a channel
+ is no longer available.
+
+ :param the_fli: A dragon FLInterface
+ :param the_channel: A dragon channel
+ """
+
+ # get a good descriptor & wipe out the FLI so it can't be attached
+ descriptor = dragon_util.channel_to_descriptor(the_fli)
+ the_fli.destroy()
+ the_channel.destroy()
+
+ with pytest.raises(SmartSimError) as ex:
+ dragon_util.descriptor_to_fli(descriptor)
+
+ # ensure we're receiving the right exception
+ assert "address" in ex.value.args[0]
+
+
+def test_descriptor_to_fli_happy_path(the_fli: fli.FLInterface) -> None:
+ """Verify that `descriptor_to_fli` works as expected when provided
+ a valid descriptor
+
+ :param the_fli: A dragon FLInterface
+ """
+
+ # get a good descriptor
+ descriptor = dragon_util.channel_to_descriptor(the_fli)
+
+ reattached = dragon_util.descriptor_to_fli(descriptor)
+ assert reattached
+
+ # and just make sure creation of the descriptor is transitive
+ assert dragon_util.channel_to_descriptor(reattached) == descriptor
+
+
+def test_pool_to_descriptor_empty() -> None:
+ """Verify that `pool_to_descriptor` raises an exception when
+ provided with a null pool."""
+
+    with pytest.raises(ValueError):
+        dragon_util.pool_to_descriptor(None)
+
+
+def test_pool_to_descriptor_happy_path(the_pool: dm.MemoryPool) -> None:
+ """Verify that `pool_to_descriptor` creates a descriptor
+ when supplied with a valid memory pool."""
+
+ descriptor = dragon_util.pool_to_descriptor(the_pool)
+ assert descriptor
diff --git a/tests/dragon_wlm/test_dragon_ddict_utils.py b/tests/dragon_wlm/test_dragon_ddict_utils.py
new file mode 100644
index 0000000000..c8bf687ef1
--- /dev/null
+++ b/tests/dragon_wlm/test_dragon_ddict_utils.py
@@ -0,0 +1,117 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+# isort: off
+import dragon.data.ddict.ddict as dragon_ddict
+
+# isort: on
+
+from smartsim._core.mli.infrastructure.storage import dragon_util
+from smartsim.log import get_logger
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+logger = get_logger(__name__)
+
+
+@pytest.mark.parametrize(
+ "num_nodes, num_managers, mem_per_node",
+ [
+ pytest.param(1, 1, 3 * 1024**2, id="3MB, Bare minimum allocation"),
+ pytest.param(2, 2, 128 * 1024**2, id="128 MB allocation, 2 nodes, 2 mgr"),
+ pytest.param(2, 1, 512 * 1024**2, id="512 MB allocation, 2 nodes, 1 mgr"),
+ ],
+)
+def test_dragon_storage_util_create_ddict(
+ num_nodes: int,
+ num_managers: int,
+ mem_per_node: int,
+):
+ """Verify that a dragon dictionary is successfully created.
+
+ :param num_nodes: Number of ddict nodes to attempt to create
+ :param num_managers: Number of managers per node to request
+    :param mem_per_node: Memory to allocate per node
+ """
+ ddict = dragon_util.create_ddict(num_nodes, num_managers, mem_per_node)
+
+ assert ddict is not None
+
+
+@pytest.mark.parametrize(
+ "num_nodes, num_managers, mem_per_node",
+ [
+ pytest.param(-1, 1, 3 * 1024**2, id="Negative Node Count"),
+ pytest.param(0, 1, 3 * 1024**2, id="Invalid Node Count"),
+ pytest.param(1, -1, 3 * 1024**2, id="Negative Mgr Count"),
+ pytest.param(1, 0, 3 * 1024**2, id="Invalid Mgr Count"),
+ pytest.param(1, 1, -3 * 1024**2, id="Negative Mem Per Node"),
+ pytest.param(1, 1, (3 * 1024**2) - 1, id="Invalid Mem Per Node"),
+ pytest.param(1, 1, 0 * 1024**2, id="No Mem Per Node"),
+ ],
+)
+def test_dragon_storage_util_create_ddict_validators(
+ num_nodes: int,
+ num_managers: int,
+ mem_per_node: int,
+):
+ """Verify that a dragon dictionary is successfully created.
+
+ :param num_nodes: Number of ddict nodes to attempt to create
+ :param num_managers: Number of managers per node to request
+ :param num_managers: Memory to allocate per node
+ """
+ with pytest.raises(ValueError):
+ dragon_util.create_ddict(num_nodes, num_managers, mem_per_node)
+
+
+def test_dragon_storage_util_get_ddict_descriptor(the_storage: dragon_ddict.DDict):
+ """Verify that a descriptor is created.
+
+ :param the_storage: A pre-allocated ddict
+ """
+ value = dragon_util.ddict_to_descriptor(the_storage)
+
+ assert isinstance(value, str)
+ assert len(value) > 0
+
+
+def test_dragon_storage_util_get_ddict_from_descriptor(the_storage: dragon_ddict.DDict):
+ """Verify that a ddict is created from a descriptor.
+
+ :param the_storage: A pre-allocated ddict
+ """
+ descriptor = dragon_util.ddict_to_descriptor(the_storage)
+
+ value = dragon_util.descriptor_to_ddict(descriptor)
+
+ assert value is not None
+ assert isinstance(value, dragon_ddict.DDict)
+ assert dragon_util.ddict_to_descriptor(value) == descriptor
diff --git a/tests/dragon_wlm/test_environment_loader.py b/tests/dragon_wlm/test_environment_loader.py
new file mode 100644
index 0000000000..07b2a45c1c
--- /dev/null
+++ b/tests/dragon_wlm/test_environment_loader.py
@@ -0,0 +1,147 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+import dragon.data.ddict.ddict as dragon_ddict
+import dragon.utils as du
+from dragon.fli import FLInterface
+
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ DragonFeatureStore,
+)
+from smartsim.error.errors import SmartSimError
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+@pytest.mark.parametrize(
+ "content",
+ [
+ pytest.param(b"a"),
+ pytest.param(b"new byte string"),
+ ],
+)
+def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch):
+ """A descriptor can be stored, loaded, and reattached."""
+ chan = create_local()
+ queue = FLInterface(main_ch=chan)
+ monkeypatch.setenv(
+ EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR,
+ du.B64.bytes_to_str(queue.serialize()),
+ )
+
+ config = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=DragonCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+ config_queue = config.get_queue()
+
+ _ = config_queue.send(content)
+
+ old_recv = queue.recvh()
+ result, _ = old_recv.recv_bytes()
+ assert result == content
+
+
+def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
+ """The serialized descriptors of a loaded and unloaded
+ queue are the same."""
+ chan = create_local()
+ queue = FLInterface(main_ch=chan)
+ monkeypatch.setenv(
+ EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR,
+ du.B64.bytes_to_str(queue.serialize()),
+ )
+
+ config = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=DragonCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+ config_queue = config.get_queue()
+ assert config_queue._fli.serialize() == queue.serialize()
+
+
+def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
+ """An incorrect serialized descriptor will fails to attach."""
+
+ monkeypatch.setenv(EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, "randomstring")
+
+ config = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=None,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+
+ with pytest.raises(SmartSimError):
+ config.get_queue()
+
+
+def test_environment_loader_backbone_load_dfs(
+ monkeypatch: pytest.MonkeyPatch, the_storage: dragon_ddict.DDict
+):
+ """Verify the dragon feature store is loaded correctly by the
+ EnvironmentConfigLoader to demonstrate featurestore_factory correctness."""
+ feature_store = DragonFeatureStore(the_storage)
+ monkeypatch.setenv(
+ EnvironmentConfigLoader.BACKBONE_ENV_VAR, feature_store.descriptor
+ )
+
+ config = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=None,
+ queue_factory=None,
+ )
+
+ print(f"calling config.get_backbone: `{feature_store.descriptor}`")
+
+ backbone = config.get_backbone()
+ assert backbone is not None
+
+
+def test_environment_variables_not_set(monkeypatch: pytest.MonkeyPatch):
+ """EnvironmentConfigLoader getters return None when environment
+ variables are not set."""
+ with monkeypatch.context() as patch:
+ patch.setenv(EnvironmentConfigLoader.BACKBONE_ENV_VAR, "")
+ patch.setenv(EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, "")
+
+ config = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=DragonCommChannel.from_descriptor,
+ queue_factory=DragonCommChannel.from_descriptor,
+ )
+ assert config.get_backbone() is None
+ assert config.get_queue() is None
diff --git a/tests/dragon_wlm/test_error_handling.py b/tests/dragon_wlm/test_error_handling.py
new file mode 100644
index 0000000000..aacd47b556
--- /dev/null
+++ b/tests/dragon_wlm/test_error_handling.py
@@ -0,0 +1,511 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+from unittest.mock import MagicMock
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+import multiprocessing as mp
+
+from dragon.channels import Channel
+from dragon.data.ddict.ddict import DDict
+from dragon.fli import FLInterface
+from dragon.mpbridge.queues import DragonQueue
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.infrastructure.control.request_dispatcher import (
+ RequestDispatcher,
+)
+from smartsim._core.mli.infrastructure.control.worker_manager import (
+ WorkerManager,
+ exception_handler,
+)
+from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
+ DragonFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.feature_store import (
+ FeatureStore,
+ ModelKey,
+ TensorKey,
+)
+from smartsim._core.mli.infrastructure.worker.worker import (
+ ExecuteResult,
+ FetchInputResult,
+ FetchModelResult,
+ InferenceRequest,
+ LoadModelResult,
+ MachineLearningWorkerBase,
+ RequestBatch,
+ TransformInputResult,
+ TransformOutputResult,
+)
+from smartsim._core.mli.message_handler import MessageHandler
+from smartsim._core.mli.mli_schemas.response.response_capnp import ResponseBuilder
+
+from .utils.channel import FileSystemCommChannel
+from .utils.worker import IntegratedTorchWorker
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+@pytest.fixture(scope="module")
+def app_feature_store(the_storage) -> FeatureStore:
+ # create a standalone feature store to mimic a user application putting
+ # data into an application-owned resource (app should not access backbone)
+ app_fs = DragonFeatureStore(the_storage)
+ return app_fs
+
+
+@pytest.fixture
+def setup_worker_manager_model_bytes(
+ test_dir: str,
+ monkeypatch: pytest.MonkeyPatch,
+ backbone_descriptor: str,
+ app_feature_store: FeatureStore,
+ the_worker_channel: DragonFLIChannel,
+):
+ integrated_worker_type = IntegratedTorchWorker
+
+ monkeypatch.setenv(
+ BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor
+ )
+ # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
+ monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor)
+
+ config_loader = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=FileSystemCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+
+ dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0)
+
+ worker_manager = WorkerManager(
+ config_loader=config_loader,
+ worker_type=integrated_worker_type,
+ dispatcher_queue=dispatcher_task_queue,
+ as_service=False,
+ cooldown=3,
+ )
+
+ tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor)
+ output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor)
+
+ inf_request = InferenceRequest(
+ model_key=None,
+ callback=None,
+ raw_inputs=None,
+ input_keys=[tensor_key],
+ input_meta=None,
+ output_keys=[output_key],
+ raw_model=b"model",
+ batch_size=0,
+ )
+
+ model_id = ModelKey(key="key", descriptor=app_feature_store.descriptor)
+
+ request_batch = RequestBatch(
+ [inf_request],
+ TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]),
+ model_id=model_id,
+ )
+
+ dispatcher_task_queue.put(request_batch)
+ return worker_manager, integrated_worker_type
+
+
+@pytest.fixture
+def setup_worker_manager_model_key(
+ test_dir: str,
+ monkeypatch: pytest.MonkeyPatch,
+ backbone_descriptor: str,
+ app_feature_store: FeatureStore,
+ the_worker_channel: DragonFLIChannel,
+):
+ integrated_worker_type = IntegratedTorchWorker
+
+ monkeypatch.setenv(
+ BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor
+ )
+ # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
+ monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor)
+
+ config_loader = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=FileSystemCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+
+ dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0)
+
+ worker_manager = WorkerManager(
+ config_loader=config_loader,
+ worker_type=integrated_worker_type,
+ dispatcher_queue=dispatcher_task_queue,
+ as_service=False,
+ cooldown=3,
+ )
+
+ tensor_key = TensorKey(key="key", descriptor=app_feature_store.descriptor)
+ output_key = TensorKey(key="key", descriptor=app_feature_store.descriptor)
+ model_id = ModelKey(key="model key", descriptor=app_feature_store.descriptor)
+
+ request = InferenceRequest(
+ model_key=model_id,
+ callback=None,
+ raw_inputs=None,
+ input_keys=[tensor_key],
+ input_meta=None,
+ output_keys=[output_key],
+ raw_model=b"model",
+ batch_size=0,
+ )
+ request_batch = RequestBatch(
+ [request],
+ TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]),
+ model_id=model_id,
+ )
+
+ dispatcher_task_queue.put(request_batch)
+ return worker_manager, integrated_worker_type
+
+
+@pytest.fixture
+def setup_request_dispatcher_model_bytes(
+ test_dir: str,
+ monkeypatch: pytest.MonkeyPatch,
+ backbone_descriptor: str,
+ app_feature_store: FeatureStore,
+ the_worker_channel: DragonFLIChannel,
+):
+ integrated_worker_type = IntegratedTorchWorker
+
+ monkeypatch.setenv(
+ BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor
+ )
+ # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
+ monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor)
+
+ config_loader = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=FileSystemCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+
+ request_dispatcher = RequestDispatcher(
+ batch_timeout=0,
+ batch_size=0,
+ config_loader=config_loader,
+ worker_type=integrated_worker_type,
+ )
+ request_dispatcher._on_start()
+
+ tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor)
+ output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor)
+ model = MessageHandler.build_model(b"model", "model name", "v 0.0.1")
+ request = MessageHandler.build_request(
+ test_dir, model, [tensor_key], [output_key], [], None
+ )
+ ser_request = MessageHandler.serialize_request(request)
+
+ request_dispatcher._incoming_channel.send(ser_request)
+
+ return request_dispatcher, integrated_worker_type
+
+
+@pytest.fixture
+def setup_request_dispatcher_model_key(
+ test_dir: str,
+ monkeypatch: pytest.MonkeyPatch,
+ backbone_descriptor: str,
+ app_feature_store: FeatureStore,
+ the_worker_channel: DragonFLIChannel,
+):
+ integrated_worker_type = IntegratedTorchWorker
+
+ monkeypatch.setenv(
+ BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor
+ )
+ # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
+ monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor)
+
+ config_loader = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=FileSystemCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+
+ request_dispatcher = RequestDispatcher(
+ batch_timeout=0,
+ batch_size=0,
+ config_loader=config_loader,
+ worker_type=integrated_worker_type,
+ )
+ request_dispatcher._on_start()
+
+ tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor)
+ output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor)
+ model_key = MessageHandler.build_model_key(
+ key="model key", descriptor=app_feature_store.descriptor
+ )
+ request = MessageHandler.build_request(
+ test_dir, model_key, [tensor_key], [output_key], [], None
+ )
+ ser_request = MessageHandler.serialize_request(request)
+
+ request_dispatcher._incoming_channel.send(ser_request)
+
+ return request_dispatcher, integrated_worker_type
+
+
+def mock_pipeline_stage(
+ monkeypatch: pytest.MonkeyPatch,
+ integrated_worker: MachineLearningWorkerBase,
+ stage: str,
+) -> t.Callable[[t.Any], ResponseBuilder]:
+ def mock_stage(*args: t.Any, **kwargs: t.Any) -> None:
+ raise ValueError(f"Simulated error in {stage}")
+
+ monkeypatch.setattr(integrated_worker, stage, mock_stage)
+ mock_reply_fn = MagicMock()
+ mock_response = MagicMock()
+ mock_response.schema.node.displayName = "Response"
+ mock_reply_fn.return_value = mock_response
+
+ monkeypatch.setattr(
+ "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply",
+ mock_reply_fn,
+ )
+
+ mock_reply_channel = MagicMock()
+ mock_reply_channel.send = MagicMock()
+
+ def mock_exception_handler(
+ exc: Exception, reply_channel: CommChannelBase, failure_message: str
+ ) -> None:
+ exception_handler(exc, mock_reply_channel, failure_message)
+
+ monkeypatch.setattr(
+ "smartsim._core.mli.infrastructure.control.worker_manager.exception_handler",
+ mock_exception_handler,
+ )
+
+ monkeypatch.setattr(
+ "smartsim._core.mli.infrastructure.control.request_dispatcher.exception_handler",
+ mock_exception_handler,
+ )
+
+ return mock_reply_fn
+
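+# Typical use of the helper above, as in the tests that follow (`integrated_worker`,
+# `worker_manager`, and `monkeypatch` come from this module's fixtures):
+#
+#     mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, "execute")
+#     worker_manager._on_iteration()
+#     mock_reply_fn.assert_called_once()
+#     mock_reply_fn.assert_called_with("fail", "Error while executing.")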
+
+@pytest.mark.parametrize(
+ "setup_worker_manager",
+ [
+ pytest.param("setup_worker_manager_model_bytes"),
+ pytest.param("setup_worker_manager_model_key"),
+ ],
+)
+@pytest.mark.parametrize(
+ "stage, error_message",
+ [
+ pytest.param(
+ "fetch_model",
+ "Error loading model on device or getting device.",
+ id="fetch model",
+ ),
+ pytest.param(
+ "load_model",
+ "Error loading model on device or getting device.",
+ id="load model",
+ ),
+ pytest.param("execute", "Error while executing.", id="execute"),
+ pytest.param(
+ "transform_output",
+ "Error while transforming the output.",
+ id="transform output",
+ ),
+ pytest.param(
+ "place_output", "Error while placing the output.", id="place output"
+ ),
+ ],
+)
+def test_wm_pipeline_stage_errors_handled(
+ request: pytest.FixtureRequest,
+ setup_worker_manager: str,
+ monkeypatch: pytest.MonkeyPatch,
+ stage: str,
+ error_message: str,
+) -> None:
+ """Ensures that the worker manager does not crash after a failure in various pipeline stages"""
+ worker_manager, integrated_worker_type = request.getfixturevalue(
+ setup_worker_manager
+ )
+ integrated_worker = worker_manager._worker
+
+ worker_manager._on_start()
+ device = worker_manager._device_manager._device
+ mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage)
+
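+    # mock the stages that run before the failing stage so the pipeline reaches it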
+ if stage not in ["fetch_model"]:
+ monkeypatch.setattr(
+ integrated_worker,
+ "fetch_model",
+ MagicMock(return_value=FetchModelResult(b"result_bytes")),
+ )
+ if stage not in ["fetch_model", "load_model"]:
+ monkeypatch.setattr(
+ integrated_worker,
+ "load_model",
+ MagicMock(return_value=LoadModelResult(b"result_bytes")),
+ )
+ monkeypatch.setattr(
+ device,
+ "get_model",
+ MagicMock(return_value=b"result_bytes"),
+ )
+ if stage not in [
+ "fetch_model",
+ "execute",
+ ]:
+ monkeypatch.setattr(
+ integrated_worker,
+ "execute",
+ MagicMock(return_value=ExecuteResult(b"result_bytes", [slice(0, 1)])),
+ )
+ if stage not in [
+ "fetch_model",
+ "execute",
+ "transform_output",
+ ]:
+ monkeypatch.setattr(
+ integrated_worker,
+ "transform_output",
+ MagicMock(
+ return_value=[TransformOutputResult(b"result", [], "c", "float32")]
+ ),
+ )
+
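+    # run a single iteration; the injected failure should be handled, not raised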
+ worker_manager._on_iteration()
+
+ mock_reply_fn.assert_called_once()
+ mock_reply_fn.assert_called_with("fail", error_message)
+
+
+@pytest.mark.parametrize(
+ "setup_request_dispatcher",
+ [
+ pytest.param("setup_request_dispatcher_model_bytes"),
+ pytest.param("setup_request_dispatcher_model_key"),
+ ],
+)
+@pytest.mark.parametrize(
+ "stage, error_message",
+ [
+ pytest.param(
+ "fetch_inputs",
+ "Error fetching input.",
+ id="fetch input",
+ ),
+ pytest.param(
+ "transform_input",
+ "Error transforming input.",
+ id="transform input",
+ ),
+ ],
+)
+def test_dispatcher_pipeline_stage_errors_handled(
+ request: pytest.FixtureRequest,
+ setup_request_dispatcher: str,
+ monkeypatch: pytest.MonkeyPatch,
+ stage: str,
+ error_message: str,
+) -> None:
+ """Ensures that the request dispatcher does not crash after a failure in various pipeline stages"""
+ request_dispatcher, integrated_worker_type = request.getfixturevalue(
+ setup_request_dispatcher
+ )
+ integrated_worker = request_dispatcher._worker
+
+ mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage)
+
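+    # mock fetch_inputs when it is not the failing stage so transform_input is reached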
+ if stage not in ["fetch_inputs"]:
+ monkeypatch.setattr(
+ integrated_worker,
+ "fetch_inputs",
+ MagicMock(return_value=[FetchInputResult(result=[b"result"], meta=None)]),
+ )
+
+ request_dispatcher._on_iteration()
+
+ mock_reply_fn.assert_called_once()
+ mock_reply_fn.assert_called_with("fail", error_message)
+
+
+def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch) -> None:
+ """Ensures that the worker manager does not crash after a failure in the
+ execute pipeline stage"""
+
+ mock_reply_channel = MagicMock()
+ mock_reply_channel.send = MagicMock()
+
+ mock_reply_fn = MagicMock()
+
+ mock_response = MagicMock()
+ mock_response.schema.node.displayName = "Response"
+ mock_reply_fn.return_value = mock_response
+
+ monkeypatch.setattr(
+ "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply",
+ mock_reply_fn,
+ )
+
+ test_exception = ValueError("Test ValueError")
+ exception_handler(
+ test_exception, mock_reply_channel, "Failure while fetching the model."
+ )
+
+ mock_reply_fn.assert_called_once()
+ mock_reply_fn.assert_called_with("fail", "Failure while fetching the model.")
+
+
+def test_dragon_feature_store_invalid_storage():
+ """Verify that attempting to create a DragonFeatureStore without storage fails."""
+ storage = None
+
+ with pytest.raises(ValueError) as ex:
+ DragonFeatureStore(storage)
+
+ assert "storage" in ex.value.args[0].lower()
+ assert "required" in ex.value.args[0].lower()
diff --git a/tests/dragon_wlm/test_event_consumer.py b/tests/dragon_wlm/test_event_consumer.py
new file mode 100644
index 0000000000..8a241bab19
--- /dev/null
+++ b/tests/dragon_wlm/test_event_consumer.py
@@ -0,0 +1,386 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import time
+import typing as t
+from unittest import mock
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster
+from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer
+from smartsim._core.mli.infrastructure.comm.event import (
+ OnCreateConsumer,
+ OnShutdownRequested,
+ OnWriteFeatureStore,
+)
+from smartsim._core.mli.infrastructure.control.listener import (
+ ConsumerRegistrationListener,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+# isort: off
+from dragon import fli
+from dragon.channels import Channel
+
+# isort: on
+
+if t.TYPE_CHECKING:
+ import conftest
+
+
+# The tests in this file must run in a dragon environment
+pytestmark = pytest.mark.dragon
+
+
+def test_eventconsumer_eventpublisher_integration(
+ the_backbone: t.Any, test_dir: str
+) -> None:
+ """Verify that the publisher and consumer integrate as expected when
+ multiple publishers and consumers are sending simultaneously. This
+ test closely tracks the test in tests/test_featurestore_base.py also named
+ test_eventconsumer_eventpublisher_integration but requires dragon entities.
+
+ :param the_backbone: The BackboneFeatureStore to use
+ :param test_dir: Automatically generated unique working
+ directories for individual test outputs
+ """
+
+ wmgr_channel = DragonCommChannel(create_local())
+ capp_channel = DragonCommChannel(create_local())
+ back_channel = DragonCommChannel(create_local())
+
+ wmgr_consumer_descriptor = wmgr_channel.descriptor
+ capp_consumer_descriptor = capp_channel.descriptor
+ back_consumer_descriptor = back_channel.descriptor
+
+ # create some consumers to receive messages
+ wmgr_consumer = EventConsumer(
+ wmgr_channel,
+ the_backbone,
+ filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN],
+ )
+ capp_consumer = EventConsumer(
+ capp_channel,
+ the_backbone,
+ )
+ back_consumer = EventConsumer(
+ back_channel,
+ the_backbone,
+ filters=[OnCreateConsumer.CONSUMER_CREATED],
+ )
+
+ # create some broadcasters to publish messages
+ mock_worker_mgr = EventBroadcaster(
+ the_backbone,
+ channel_factory=DragonCommChannel.from_descriptor,
+ )
+ mock_client_app = EventBroadcaster(
+ the_backbone,
+ channel_factory=DragonCommChannel.from_descriptor,
+ )
+
+    # register all of the consumers, even though the OnCreateConsumer event
+    # should really trigger registration on its own; event processing is
+    # tested elsewhere
+ the_backbone.notification_channels = [
+ wmgr_consumer_descriptor,
+ capp_consumer_descriptor,
+ back_consumer_descriptor,
+ ]
+
+ # simulate worker manager sending a notification to backend that it's alive
+ event_1 = OnCreateConsumer(
+ "test_eventconsumer_eventpublisher_integration",
+ wmgr_consumer_descriptor,
+ filters=[],
+ )
+ mock_worker_mgr.send(event_1)
+
+ # simulate the app updating a model a few times
+ for key in ["key-1", "key-2", "key-1"]:
+ event = OnWriteFeatureStore(
+ "test_eventconsumer_eventpublisher_integration",
+ the_backbone.descriptor,
+ key,
+ )
+ mock_client_app.send(event, timeout=0.1)
+
+    # worker manager should only get updates about feature store writes
+ wmgr_messages = wmgr_consumer.recv()
+ assert len(wmgr_messages) == 3
+
+ # the backend should only receive messages about consumer creation
+ back_messages = back_consumer.recv()
+ assert len(back_messages) == 1
+
+ # hypothetical app has no filters and will get all events
+ app_messages = capp_consumer.recv()
+ assert len(app_messages) == 4
+
+
+@pytest.mark.parametrize(
+ " timeout, batch_timeout, exp_err_msg",
+ [(-1, 1, " timeout"), (1, -1, "batch_timeout")],
+)
+def test_eventconsumer_invalid_timeout(
+ timeout: float,
+ batch_timeout: float,
+ exp_err_msg: str,
+ test_dir: str,
+ the_backbone: BackboneFeatureStore,
+) -> None:
+ """Verify that the event consumer raises an exception
+ when provided an invalid request timeout.
+
+ :param timeout: The request timeout for the event consumer recv call
+ :param batch_timeout: The batch timeout for the event consumer recv call
+ :param exp_err_msg: A unique value from the error message that should be raised
+    :param the_backbone: The BackboneFeatureStore to use
+ :param test_dir: Automatically generated unique working
+ directories for individual test outputs
+ """
+
+ wmgr_channel = DragonCommChannel(create_local())
+
+ # create some consumers to receive messages
+ wmgr_consumer = EventConsumer(
+ wmgr_channel,
+ the_backbone,
+ filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN],
+ )
+
+ # the consumer should report an error for the invalid timeout value
+ with pytest.raises(ValueError) as ex:
+ wmgr_consumer.recv(timeout=timeout, batch_timeout=batch_timeout)
+
+ assert exp_err_msg in ex.value.args[0]
+
+
+def test_eventconsumer_no_event_handler_registered(
+ the_backbone: t.Any, test_dir: str
+) -> None:
+ """Verify that a consumer discards messages when
+ on a channel if no handler is registered.
+
+ :param the_backbone: The BackboneFeatureStore to use
+ :param test_dir: Automatically generated unique working
+ directories for individual test outputs
+ """
+
+ wmgr_channel = DragonCommChannel(create_local())
+
+ # create a consumer to receive messages
+ wmgr_consumer = EventConsumer(wmgr_channel, the_backbone, event_handler=None)
+
+    # create a broadcaster to publish messages
+ mock_worker_mgr = EventBroadcaster(
+ the_backbone,
+ channel_factory=DragonCommChannel.from_descriptor,
+ )
+
+ # manually register the consumers since we don't have a backend running
+ the_backbone.notification_channels = [wmgr_channel.descriptor]
+
+ # simulate the app updating a model a few times
+ for key in ["key-1", "key-2", "key-1"]:
+ event = OnWriteFeatureStore(
+ "test_eventconsumer_no_event_handler_registered",
+ the_backbone.descriptor,
+ key,
+ )
+ mock_worker_mgr.send(event, timeout=0.1)
+
+ # run the handler and let it discard messages
+ for _ in range(15):
+ wmgr_consumer.listen_once(0.2, 2.0)
+
+ assert wmgr_consumer.listening
+
+
+def test_eventconsumer_no_event_handler_registered_shutdown(
+ the_backbone: t.Any, test_dir: str
+) -> None:
+ """Verify that a consumer without an event handler
+ registered still honors shutdown requests.
+
+ :param the_backbone: The BackboneFeatureStore to use
+ :param test_dir: Automatically generated unique working
+ directories for individual test outputs
+ """
+
+ wmgr_channel = DragonCommChannel(create_local())
+ capp_channel = DragonCommChannel(create_local())
+
+    # create a consumer to receive messages
+ wmgr_consumer = EventConsumer(wmgr_channel, the_backbone)
+
+ # create a broadcaster to publish messages
+ mock_worker_mgr = EventBroadcaster(
+ the_backbone,
+ channel_factory=DragonCommChannel.from_descriptor,
+ )
+
+ # manually register the consumers since we don't have a backend running
+ the_backbone.notification_channels = [
+ wmgr_channel.descriptor,
+ capp_channel.descriptor,
+ ]
+
+ # simulate the app updating a model a few times
+ for key in ["key-1", "key-2", "key-1"]:
+ event = OnWriteFeatureStore(
+ "test_eventconsumer_no_event_handler_registered_shutdown",
+ the_backbone.descriptor,
+ key,
+ )
+ mock_worker_mgr.send(event, timeout=0.1)
+
+ event = OnShutdownRequested(
+ "test_eventconsumer_no_event_handler_registered_shutdown"
+ )
+ mock_worker_mgr.send(event, timeout=0.1)
+
+ # wmgr will stop listening to messages when it is told to stop listening
+ wmgr_consumer.listen(timeout=0.1, batch_timeout=2.0)
+
+ for _ in range(15):
+ wmgr_consumer.listen_once(timeout=0.1, batch_timeout=2.0)
+
+ # confirm the messages were processed, discarded, and the shutdown was received
+    assert not wmgr_consumer.listening
+
+
+def test_eventconsumer_registration(
+ the_backbone: t.Any, test_dir: str, monkeypatch: pytest.MonkeyPatch
+) -> None:
+ """Verify that a consumer is correctly registered in
+    the backbone after sending a registration request. Then confirm
+    the consumer is unregistered after sending the unregister request.
+
+ :param the_backbone: The BackboneFeatureStore to use
+ :param test_dir: Automatically generated unique working
+ directories for individual test outputs
+ """
+
+ with monkeypatch.context() as patch:
+ registrar = ConsumerRegistrationListener(
+ the_backbone, 1.0, 2.0, as_service=False
+ )
+
+ # NOTE: service.execute(as_service=False) will complete the service life-
+ # cycle and remove the registrar from the backbone, so mock _on_shutdown
+ disabled_shutdown = mock.MagicMock()
+ patch.setattr(registrar, "_on_shutdown", disabled_shutdown)
+
+        # initialize registrar resources
+ registrar.execute()
+
+ # create a consumer that will be registered
+ wmgr_channel = DragonCommChannel(create_local())
+ wmgr_consumer = EventConsumer(wmgr_channel, the_backbone)
+
+ registered_channels = the_backbone.notification_channels
+
+ # trigger the consumer-to-registrar handshake
+ wmgr_consumer.register()
+
+ current_registrations: t.List[str] = []
+
+ # have the registrar run a few times to pick up the msg
+ for i in range(15):
+ registrar.execute()
+ current_registrations = the_backbone.notification_channels
+ if len(current_registrations) != len(registered_channels):
+ logger.debug(f"The event was processed on iteration {i}")
+ break
+
+ # confirm the consumer is registered
+ assert wmgr_channel.descriptor in current_registrations
+
+ # copy old list so we can compare against it.
+ registered_channels = list(current_registrations)
+
+ # trigger the consumer removal
+ wmgr_consumer.unregister()
+
+ # have the registrar run a few times to pick up the msg
+ for i in range(15):
+ registrar.execute()
+ current_registrations = the_backbone.notification_channels
+ if len(current_registrations) != len(registered_channels):
+ logger.debug(f"The event was processed on iteration {i}")
+ break
+
+ # confirm the consumer is no longer registered
+ assert wmgr_channel.descriptor not in current_registrations
+
+
+def test_registrar_teardown(
+ the_backbone: t.Any, test_dir: str, monkeypatch: pytest.MonkeyPatch
+) -> None:
+ """Verify that the consumer registrar removes itself from
+ the backbone when it shuts down.
+
+ :param the_backbone: The BackboneFeatureStore to use
+ :param test_dir: Automatically generated unique working
+ directories for individual test outputs
+ """
+
+ with monkeypatch.context() as patch:
+ registrar = ConsumerRegistrationListener(
+ the_backbone, 1.0, 2.0, as_service=False
+ )
+
+        # directly initialize registrar resources to avoid the service life-cycle
+ registrar._create_eventing()
+
+ # confirm the registrar is published to the backbone
+ cfg = the_backbone.wait_for([BackboneFeatureStore.MLI_REGISTRAR_CONSUMER], 10)
+ assert BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in cfg
+
+ # execute the entire service lifecycle 1x
+ registrar.execute()
+
+ consumer_found = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in the_backbone
+
+ for i in range(15):
+ time.sleep(0.1)
+ consumer_found = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in the_backbone
+ if not consumer_found:
+ logger.debug(f"Registrar removed from the backbone on iteration {i}")
+ break
+
+ assert BackboneFeatureStore.MLI_REGISTRAR_CONSUMER not in the_backbone
diff --git a/tests/dragon_wlm/test_featurestore.py b/tests/dragon_wlm/test_featurestore.py
new file mode 100644
index 0000000000..019dcde7a0
--- /dev/null
+++ b/tests/dragon_wlm/test_featurestore.py
@@ -0,0 +1,327 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import multiprocessing as mp
+import random
+import time
+import typing as t
+import unittest.mock as mock
+import uuid
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ time as bbtime,
+)
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+# isort: off
+from dragon import fli
+from dragon.channels import Channel
+
+# isort: on
+
+if t.TYPE_CHECKING:
+ import conftest
+
+
+# The tests in this file must run in a dragon environment
+pytestmark = pytest.mark.dragon
+
+
+def test_backbone_wait_for_no_keys(
+ the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch
+) -> None:
+ """Verify that asking the backbone to wait for a value succeeds
+ immediately and does not cause a wait to occur if the supplied key
+ list is empty.
+
+    :param the_backbone: the storage engine to use, prepopulated by the test fixture
+ """
+    # an empty key list should return immediately without waiting
+
+ with monkeypatch.context() as ctx:
+ # all keys should be found and the timeout should never be checked.
+ ctx.setattr(bbtime, "sleep", mock.MagicMock())
+
+ values = the_backbone.wait_for([])
+ assert len(values) == 0
+
+ # confirm that no wait occurred
+ bbtime.sleep.assert_not_called()
+
+
+def test_backbone_wait_for_prepopulated(
+ the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch
+) -> None:
+ """Verify that asking the backbone to wait for a value succeed
+ immediately and do not cause a wait to occur if the data exists.
+
+ :param the_backbone: the storage engine to use, prepopulated with
+ """
+ # set a very low timeout to confirm that it does not wait
+
+ with monkeypatch.context() as ctx:
+ # all keys should be found and the timeout should never be checked.
+ ctx.setattr(bbtime, "sleep", mock.MagicMock())
+
+ values = the_backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], 0.1)
+
+ # confirm that wait_for with one key returns one value
+ assert len(values) == 1
+
+ # confirm that the descriptor is non-null w/some non-trivial value
+ assert len(values[BackboneFeatureStore.MLI_WORKER_QUEUE]) > 5
+
+ # confirm that no wait occurred
+ bbtime.sleep.assert_not_called()
+
+
+def test_backbone_wait_for_prepopulated_dupe(
+ the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch
+) -> None:
+ """Verify that asking the backbone to wait for keys that are duplicated
+ results in a single value being returned for each key.
+
+    :param the_backbone: the storage engine to use, prepopulated by the test fixture
+ """
+    # the keys are already set, so wait_for should return without waiting
+
+ key1, key2 = "key-1", "key-2"
+ value1, value2 = "i-am-value-1", "i-am-value-2"
+ the_backbone[key1] = value1
+ the_backbone[key2] = value2
+
+ with monkeypatch.context() as ctx:
+ # all keys should be found and the timeout should never be checked.
+ ctx.setattr(bbtime, "sleep", mock.MagicMock())
+
+ values = the_backbone.wait_for([key1, key2, key1]) # key1 is duplicated
+
+        # confirm that duplicate keys are de-duplicated and each unique key returns one value
+ assert len(values) == 2
+ assert key1 in values
+ assert key2 in values
+
+ assert values[key1] == value1
+ assert values[key2] == value2
+
+
+def set_value_after_delay(
+ descriptor: str, key: str, value: str, delay: float = 5
+) -> None:
+ """Helper method to persist a random value into the backbone
+
+ :param descriptor: the backbone feature store descriptor to attach to
+ :param key: the key to write to
+ :param value: a value to write to the key
+ :param delay: amount of delay to apply before writing the key
+ """
+ time.sleep(delay)
+
+ backbone = BackboneFeatureStore.from_descriptor(descriptor)
+ backbone[key] = value
+ logger.debug(f"set_value_after_delay wrote `{value} to backbone[`{key}`]")
+
+
+@pytest.mark.parametrize(
+ "delay",
+ [
+ pytest.param(
+ 0,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ pytest.param(
+ 1,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ pytest.param(
+ 2,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ pytest.param(
+ 4,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ pytest.param(
+ 8,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ ],
+)
+def test_backbone_wait_for_partial_prepopulated(
+ the_backbone: BackboneFeatureStore, delay: float
+) -> None:
+ """Verify that when data is not all in the backbone, the `wait_for` operation
+ continues to poll until it finds everything it needs.
+
+    :param the_backbone: the storage engine to use, prepopulated by the test fixture
+ :param delay: the number of seconds the second process will wait before
+ setting the target value in the backbone featurestore
+ """
+    # allow enough time for the delayed writer process to set the value
+ wait_timeout = 10
+
+ key, value = str(uuid.uuid4()), str(random.random() * 10)
+
+ logger.debug(f"Starting process to write {key} after {delay}s")
+ p = mp.Process(
+ target=set_value_after_delay, args=(the_backbone.descriptor, key, value, delay)
+ )
+ p.start()
+
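+    # a second process waits for both the prepopulated key and the delayed key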
+ p2 = mp.Process(
+ target=the_backbone.wait_for,
+ args=([BackboneFeatureStore.MLI_WORKER_QUEUE, key],),
+ kwargs={"timeout": wait_timeout},
+ )
+ p2.start()
+
+ p.join()
+ p2.join()
+
+ # both values should be written at this time
+ ret_vals = the_backbone.wait_for(
+ [key, BackboneFeatureStore.MLI_WORKER_QUEUE, key], 0.1
+ )
+ # confirm that wait_for with two keys returns two values
+ assert len(ret_vals) == 2, "values should contain values for both awaited keys"
+
+ # confirm the pre-populated value has the correct output
+ assert (
+ ret_vals[BackboneFeatureStore.MLI_WORKER_QUEUE] == "12345"
+ ) # mock descriptor value from fixture
+
+ # confirm the population process completed and the awaited value is correct
+    assert ret_vals[key] == value, "awaited value does not match the written value"
+
+
+@pytest.mark.parametrize(
+ "num_keys",
+ [
+ pytest.param(
+ 0,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ pytest.param(
+ 1,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ pytest.param(
+ 3,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ pytest.param(
+ 7,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ pytest.param(
+ 11,
+ marks=pytest.mark.skip(
+ "Must use entrypoint instead of mp.Process to run on build agent"
+ ),
+ ),
+ ],
+)
+def test_backbone_wait_for_multikey(
+ the_backbone: BackboneFeatureStore,
+ num_keys: int,
+ test_dir: str,
+) -> None:
+ """Verify that asking the backbone to wait for multiple keys results
+ in that number of values being returned.
+
+    :param the_backbone: the storage engine to use, prepopulated by the test fixture
+ :param num_keys: the number of extra keys to set & request in the backbone
+ """
+ # maximum delay allowed for setter processes
+ max_delay = 5
+
+ extra_keys = [str(uuid.uuid4()) for _ in range(num_keys)]
+ extra_values = [str(uuid.uuid4()) for _ in range(num_keys)]
+ extras = dict(zip(extra_keys, extra_values))
+ delays = [random.random() * max_delay for _ in range(num_keys)]
+ processes = []
+
+ for key, value, delay in zip(extra_keys, extra_values, delays):
+ assert delay < max_delay, "write delay exceeds test timeout"
+ logger.debug(f"Delaying {key} write by {delay} seconds")
+ p = mp.Process(
+ target=set_value_after_delay,
+ args=(the_backbone.descriptor, key, value, delay),
+ )
+ p.start()
+ processes.append(p)
+
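+    # a separate process waits for all of the keys while the setters write them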
+ p2 = mp.Process(
+ target=the_backbone.wait_for,
+ args=(extra_keys,),
+ kwargs={"timeout": max_delay * 2},
+ )
+ p2.start()
+ for p in processes:
+ p.join(timeout=max_delay * 2)
+    p2.join(timeout=max_delay * 2)  # allow the waiter the same window as the setters
+
+    # use a near-zero timeout to verify all values have already been written
+    num_keys = len(extra_keys)
+    actual_values = the_backbone.wait_for(extra_keys, timeout=0.01)
+
+ # confirm that wait_for returns all the expected values
+ assert len(actual_values) == num_keys
+
+ # confirm that the returned values match (e.g. are returned in the right order)
+ for k in extras:
+ assert extras[k] == actual_values[k]
diff --git a/tests/dragon_wlm/test_featurestore_base.py b/tests/dragon_wlm/test_featurestore_base.py
new file mode 100644
index 0000000000..6daceb9061
--- /dev/null
+++ b/tests/dragon_wlm/test_featurestore_base.py
@@ -0,0 +1,844 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pathlib
+import time
+import typing as t
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster
+from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer
+from smartsim._core.mli.infrastructure.comm.event import (
+ OnCreateConsumer,
+ OnWriteFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
+ DragonFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys
+from smartsim.error import SmartSimError
+
+from .channel import FileSystemCommChannel
+from .feature_store import MemoryFeatureStore
+
+if t.TYPE_CHECKING:
+ import conftest
+
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+def boom(*args, **kwargs) -> None:
+ """Helper function that blows up when used to mock up
+ some other function."""
+ raise Exception(f"you shall not pass! {args}, {kwargs}")
+
+
+def test_event_uid() -> None:
+ """Verify that all events include a unique identifier."""
+ uids: t.Set[str] = set()
+ num_iters = 1000
+
+ # generate a bunch of events and keep track all the IDs
+ for i in range(num_iters):
+ event_a = OnCreateConsumer("test_event_uid", str(i), filters=[])
+ event_b = OnWriteFeatureStore("test_event_uid", "test_event_uid", str(i))
+
+ uids.add(event_a.uid)
+ uids.add(event_b.uid)
+
+ # verify each event created a unique ID
+ assert len(uids) == 2 * num_iters
+
+
+def test_mli_reserved_keys_conversion() -> None:
+ """Verify that conversion from a string to an enum member
+ works as expected."""
+
+ for reserved_key in ReservedKeys:
+ # iterate through all keys and verify `from_string` works
+ assert ReservedKeys.contains(reserved_key.value)
+
+        # show that the enum member name (unlike the value, which is the
+        # actual key) is not incorrectly identified as reserved
+ assert not ReservedKeys.contains(str(reserved_key).split(".")[1])
+
+
+def test_mli_reserved_keys_writes() -> None:
+ """Verify that attempts to write to reserved keys are blocked from a
+ standard DragonFeatureStore but enabled with the BackboneFeatureStore."""
+
+ mock_storage = {}
+ dfs = DragonFeatureStore(mock_storage)
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ other = MemoryFeatureStore(mock_storage)
+
+ expected_value = "value"
+
+ for reserved_key in ReservedKeys:
+ # we expect every reserved key to fail using DragonFeatureStore...
+ with pytest.raises(SmartSimError) as ex:
+ dfs[reserved_key] = expected_value
+
+ assert "reserved key" in ex.value.args[0]
+
+ # ... and expect other feature stores to respect reserved keys
+ with pytest.raises(SmartSimError) as ex:
+ other[reserved_key] = expected_value
+
+ assert "reserved key" in ex.value.args[0]
+
+ # ...and those same keys to succeed on the backbone
+ backbone[reserved_key] = expected_value
+ actual_value = backbone[reserved_key]
+ assert actual_value == expected_value
+
+
+def test_mli_consumers_read_by_key() -> None:
+ """Verify that the value returned from the mli consumers method is written
+ to the correct key and reads are allowed via standard dragon feature store."""
+
+ mock_storage = {}
+ dfs = DragonFeatureStore(mock_storage)
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ other = MemoryFeatureStore(mock_storage)
+
+ expected_value = "value"
+
+ # write using backbone that has permission to write reserved keys
+ backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value
+
+ # confirm read-only access to reserved keys from any FeatureStore
+ for fs in [dfs, backbone, other]:
+ assert fs[ReservedKeys.MLI_NOTIFY_CONSUMERS] == expected_value
+
+
+def test_mli_consumers_read_by_backbone() -> None:
+ """Verify that the backbone reads the correct location
+ when using the backbone feature store API instead of mapping API."""
+
+ mock_storage = {}
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ expected_value = "value"
+
+ backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value
+
+ # confirm reading via convenience method returns expected value
+ assert backbone.notification_channels[0] == expected_value
+
+
+def test_mli_consumers_write_by_backbone() -> None:
+ """Verify that the backbone writes the correct location
+ when using the backbone feature store API instead of mapping API."""
+
+ mock_storage = {}
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ expected_value = ["value"]
+
+ backbone.notification_channels = expected_value
+
+ # confirm write using convenience method targets expected key
+ assert backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] == ",".join(expected_value)
+
+
+def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None:
+ """Verify that a broadcast operation without any registered subscribers
+ succeeds without raising Exceptions.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ mock_storage = {}
+ consumer_descriptor = storage_path / "test-consumer"
+
+ # NOTE: we're not putting any consumers into the backbone here!
+ backbone = BackboneFeatureStore(mock_storage)
+
+    publisher = EventBroadcaster(backbone)
+    num_receivers = 0
+
+    # publishing this event without any known consumers registered should succeed
+    # but report that it didn't have anybody to send the event to
+    event = OnCreateConsumer(
+        "test_eventpublisher_broadcast_no_factory", consumer_descriptor, filters=[]
+    )
+
+ num_receivers += publisher.send(event)
+
+ # confirm no changes to the backbone occur when fetching the empty consumer key
+ key_in_features_store = ReservedKeys.MLI_NOTIFY_CONSUMERS in backbone
+ assert not key_in_features_store
+
+ # confirm that the broadcast reports no events published
+ assert num_receivers == 0
+ # confirm that the broadcast buffered the event for a later send
+ assert publisher.num_buffered == 1
+
+
+def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None:
+ """Verify that a broadcast operation without any registered subscribers
+ succeeds without raising Exceptions.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ consumer_descriptor = storage_path / "test-consumer"
+
+ # prep our backbone with a consumer list
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ backbone.notification_channels = []
+
+ event = OnCreateConsumer(
+ "test_eventpublisher_broadcast_to_empty_consumer_list",
+ consumer_descriptor,
+ filters=[],
+ )
+ publisher = EventBroadcaster(
+ backbone, channel_factory=FileSystemCommChannel.from_descriptor
+ )
+ num_receivers = publisher.send(event)
+
+ registered_consumers = backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS]
+
+ # confirm that no consumers exist in backbone to send to
+ assert not registered_consumers
+ # confirm that the broadcast reports no events published
+ assert num_receivers == 0
+ # confirm that the broadcast buffered the event for a later send
+ assert publisher.num_buffered == 1
+
+
+def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None:
+ """Verify that a broadcast operation reports an error if no channel
+ factory was supplied for constructing the consumer channels.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ consumer_descriptor = storage_path / "test-consumer"
+
+ # prep our backbone with a consumer list
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ backbone.notification_channels = [consumer_descriptor]
+
+ event = OnCreateConsumer(
+ "test_eventpublisher_broadcast_without_channel_factory",
+ consumer_descriptor,
+ filters=[],
+ )
+ publisher = EventBroadcaster(
+ backbone,
+ # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied
+ )
+
+ with pytest.raises(SmartSimError) as ex:
+ publisher.send(event)
+
+ assert "factory" in ex.value.args[0]
+
+
+def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None:
+ """Verify that a successful broadcast clears messages from the event
+ buffer when a new message is sent and consumers are registered.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ consumer_descriptor = storage_path / "test-consumer"
+
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ backbone.notification_channels = (consumer_descriptor,)
+
+ publisher = EventBroadcaster(
+ backbone, channel_factory=FileSystemCommChannel.from_descriptor
+ )
+
+ # mock building up some buffered events
+ num_buffered_events = 14
+ for i in range(num_buffered_events):
+ event = OnCreateConsumer(
+ "test_eventpublisher_broadcast_empties_buffer",
+ storage_path / f"test-consumer-{str(i)}",
+ [],
+ )
+ publisher._event_buffer.append(bytes(event))
+
+ event0 = OnCreateConsumer(
+ "test_eventpublisher_broadcast_empties_buffer",
+ storage_path / f"test-consumer-{str(num_buffered_events + 1)}",
+ [],
+ )
+
+ num_receivers = publisher.send(event0)
+ # 1 receiver x 15 total events == 15 events
+ assert num_receivers == num_buffered_events + 1
+
+
+@pytest.mark.parametrize(
+ "num_consumers, num_buffered, expected_num_sent",
+ [
+ pytest.param(0, 7, 0, id="0 x (7+1) - no consumers, multi-buffer"),
+ pytest.param(1, 7, 8, id="1 x (7+1) - single consumer, multi-buffer"),
+ pytest.param(2, 7, 16, id="2 x (7+1) - multi-consumer, multi-buffer"),
+ pytest.param(4, 4, 20, id="4 x (4+1) - multi-consumer, multi-buffer (odd #)"),
+ pytest.param(9, 0, 9, id="13 x (0+1) - multi-consumer, empty buffer"),
+ ],
+)
+def test_eventpublisher_broadcast_returns_total_sent(
+ test_dir: str, num_consumers: int, num_buffered: int, expected_num_sent: int
+) -> None:
+ """Verify that a successful broadcast returns the total number of events
+ sent, including buffered messages.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ :param num_consumers: the number of consumers to mock setting up prior to send
+ :param num_buffered: the number of pre-buffered events to mock up
+ :param expected_num_sent: the expected result from calling send
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ consumers = []
+ for i in range(num_consumers):
+ consumers.append(storage_path / f"test-consumer-{i}")
+
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ backbone.notification_channels = consumers
+
+ publisher = EventBroadcaster(
+ backbone, channel_factory=FileSystemCommChannel.from_descriptor
+ )
+
+ # mock building up some buffered events
+ for i in range(num_buffered):
+ event = OnCreateConsumer(
+ "test_eventpublisher_broadcast_returns_total_sent",
+ storage_path / f"test-consumer-{str(i)}",
+ [],
+ )
+ publisher._event_buffer.append(bytes(event))
+
+ assert publisher.num_buffered == num_buffered
+
+ # this event will trigger clearing anything already in buffer
+ event0 = OnCreateConsumer(
+ "test_eventpublisher_broadcast_returns_total_sent",
+ storage_path / f"test-consumer-{num_buffered}",
+ [],
+ )
+
+    # num_receivers should equal the total sent across all consumers and all events
+ num_receivers = publisher.send(event0)
+
+ assert num_receivers == expected_num_sent
+
+
+def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None:
+ """Verify that any unused consumers are pruned each time a new event is sent.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ consumer_descriptor = storage_path / "test-consumer"
+
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+
+ publisher = EventBroadcaster(
+ backbone, channel_factory=FileSystemCommChannel.from_descriptor
+ )
+
+ event = OnCreateConsumer(
+ "test_eventpublisher_prune_unused_consumer",
+ consumer_descriptor,
+ filters=[],
+ )
+
+    # the only registered consumer is in the event, expect no pruning
+ backbone.notification_channels = (consumer_descriptor,)
+
+ publisher.send(event)
+ assert str(consumer_descriptor) in publisher._channel_cache
+ assert len(publisher._channel_cache) == 1
+
+ # add a new descriptor for another event...
+ consumer_descriptor2 = storage_path / "test-consumer-2"
+ # ... and remove the old descriptor from the backbone when it's looked up
+ backbone.notification_channels = (consumer_descriptor2,)
+
+ event = OnCreateConsumer(
+ "test_eventpublisher_prune_unused_consumer", consumer_descriptor2, filters=[]
+ )
+
+ publisher.send(event)
+
+ assert str(consumer_descriptor2) in publisher._channel_cache
+ assert str(consumer_descriptor) not in publisher._channel_cache
+ assert len(publisher._channel_cache) == 1
+
+ # test multi-consumer pruning by caching some extra channels
+ prune0, prune1, prune2 = "abc", "def", "ghi"
+ publisher._channel_cache[prune0] = "doesnt-matter-if-it-is-pruned"
+ publisher._channel_cache[prune1] = "doesnt-matter-if-it-is-pruned"
+ publisher._channel_cache[prune2] = "doesnt-matter-if-it-is-pruned"
+
+ # add in one of our old channels so we prune the above items, send to these
+ backbone.notification_channels = (consumer_descriptor, consumer_descriptor2)
+
+ publisher.send(event)
+
+ assert str(consumer_descriptor2) in publisher._channel_cache
+
+ # NOTE: we should NOT prune something that isn't used by this message but
+ # does appear in `backbone.notification_channels`
+ assert str(consumer_descriptor) in publisher._channel_cache
+
+ # confirm all of our items that were not in the notification channels are gone
+ for pruned in [prune0, prune1, prune2]:
+ assert pruned not in publisher._channel_cache
+
+ # confirm we have only the two expected items in the channel cache
+ assert len(publisher._channel_cache) == 2
+
+
+def test_eventpublisher_serialize_failure(
+ test_dir: str, monkeypatch: pytest.MonkeyPatch
+) -> None:
+ """Verify that errors during message serialization are raised to the caller.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ :param monkeypatch: pytest fixture for modifying behavior of existing code
+ with mock implementations
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ storage_path.mkdir(parents=True, exist_ok=True)
+
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ target_descriptor = str(storage_path / "test-consumer")
+
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ publisher = EventBroadcaster(
+ backbone, channel_factory=FileSystemCommChannel.from_descriptor
+ )
+
+ with monkeypatch.context() as patch:
+ event = OnCreateConsumer(
+ "test_eventpublisher_serialize_failure", target_descriptor, filters=[]
+ )
+
+ # patch the __bytes__ implementation to cause pickling to fail during send
+ def bad_bytes(self) -> bytes:
+ return b"abc"
+
+ # this patch causes an attribute error when event pickling is attempted
+ patch.setattr(event, "__bytes__", bad_bytes)
+
+ backbone.notification_channels = (target_descriptor,)
+
+ # send a message into the channel
+ with pytest.raises(AttributeError) as ex:
+ publisher.send(event)
+
+ assert "serialize" in ex.value.args[0]
+
+
+def test_eventpublisher_factory_failure(
+ test_dir: str, monkeypatch: pytest.MonkeyPatch
+) -> None:
+ """Verify that errors during channel construction are raised to the caller.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ :param monkeypatch: pytest fixture for modifying behavior of existing code
+ with mock implementations
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ storage_path.mkdir(parents=True, exist_ok=True)
+
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ target_descriptor = str(storage_path / "test-consumer")
+
+ def boom(descriptor: str) -> None:
+ raise Exception(f"you shall not pass! {descriptor}")
+
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ publisher = EventBroadcaster(backbone, channel_factory=boom)
+
+ with monkeypatch.context() as patch:
+ event = OnCreateConsumer(
+ "test_eventpublisher_factory_failure", target_descriptor, filters=[]
+ )
+
+ backbone.notification_channels = (target_descriptor,)
+
+ # send a message into the channel
+ with pytest.raises(SmartSimError) as ex:
+ publisher.send(event)
+
+ assert "construct" in ex.value.args[0]
+
+
+def test_eventpublisher_failure(test_dir: str, monkeypatch: pytest.MonkeyPatch) -> None:
+ """Verify that unexpected errors during message send are caught and wrapped in a
+ SmartSimError so they are not propagated directly to the caller.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ :param monkeypatch: pytest fixture for modifying behavior of existing code
+ with mock implementations
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ storage_path.mkdir(parents=True, exist_ok=True)
+
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ target_descriptor = str(storage_path / "test-consumer")
+
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+ publisher = EventBroadcaster(
+ backbone, channel_factory=FileSystemCommChannel.from_descriptor
+ )
+
+ def boom(self) -> None:
+ raise Exception("That was unexpected...")
+
+ with monkeypatch.context() as patch:
+ event = OnCreateConsumer(
+ "test_eventpublisher_failure", target_descriptor, filters=[]
+ )
+
+        # patch the _broadcast implementation to cause send to fail after
+        # the event has been pickled
+ patch.setattr(publisher, "_broadcast", boom)
+
+ backbone.notification_channels = (target_descriptor,)
+
+        # confirm the unexpected exception raised by _broadcast is not allowed
+        # directly out and is instead wrapped in a SmartSimError
+ with pytest.raises(SmartSimError) as ex:
+ publisher.send(event)
+
+ assert "unexpected" in ex.value.args[0]
+
+
+def test_eventconsumer_receive(test_dir: str) -> None:
+ """Verify that a consumer retrieves a message from the given channel.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ storage_path.mkdir(parents=True, exist_ok=True)
+
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ target_descriptor = str(storage_path / "test-consumer")
+
+ backbone = BackboneFeatureStore(mock_storage)
+ comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
+ event = OnCreateConsumer(
+ "test_eventconsumer_receive", target_descriptor, filters=[]
+ )
+
+ # simulate a sent event by writing directly to the input comm channel
+ comm_channel.send(bytes(event))
+
+ consumer = EventConsumer(comm_channel, backbone)
+
+ all_received: t.List[OnCreateConsumer] = consumer.recv()
+ assert len(all_received) == 1
+
+ # verify we received the same event that was raised
+ assert all_received[0].category == event.category
+ assert all_received[0].descriptor == event.descriptor
+
+
+@pytest.mark.parametrize("num_sent", [0, 1, 2, 4, 8, 16])
+def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None:
+ """Verify that a consumer retrieves multiple message from the given channel.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+    :param num_sent: parameterized value used to vary the number of enqueued
+    events so validations are checked at multiple queue sizes
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ storage_path.mkdir(parents=True, exist_ok=True)
+
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ target_descriptor = str(storage_path / "test-consumer")
+
+ backbone = BackboneFeatureStore(mock_storage)
+ comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
+
+ # simulate multiple sent events by writing directly to the input comm channel
+ for _ in range(num_sent):
+ event = OnCreateConsumer(
+ "test_eventconsumer_receive_multi", target_descriptor, filters=[]
+ )
+ comm_channel.send(bytes(event))
+
+ consumer = EventConsumer(comm_channel, backbone)
+
+ all_received: t.List[OnCreateConsumer] = consumer.recv()
+ assert len(all_received) == num_sent
+
+
+def test_eventconsumer_receive_empty(test_dir: str) -> None:
+ """Verify that a consumer receiving an empty message ignores the
+ message and continues processing.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ storage_path.mkdir(parents=True, exist_ok=True)
+
+ mock_storage = {}
+
+ # note: file-system descriptors are just paths
+ target_descriptor = str(storage_path / "test-consumer")
+
+ backbone = BackboneFeatureStore(mock_storage)
+ comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
+
+ # simulate a sent event by writing directly to the input comm channel
+ comm_channel.send(bytes(b""))
+
+ consumer = EventConsumer(comm_channel, backbone)
+
+ messages = consumer.recv()
+
+ # the messages array should be empty
+ assert not messages
+
+
+def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
+ """Verify that the publisher and consumer integrate as expected when
+ multiple publishers and consumers are sending simultaneously.
+
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ storage_path.mkdir(parents=True, exist_ok=True)
+
+ mock_storage = {}
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+    mock_fs_descriptor = str(storage_path / "mock-feature-store")
+
+ wmgr_channel = FileSystemCommChannel(storage_path / "test-wmgr")
+ capp_channel = FileSystemCommChannel(storage_path / "test-capp")
+ back_channel = FileSystemCommChannel(storage_path / "test-backend")
+
+ wmgr_consumer_descriptor = wmgr_channel.descriptor
+ capp_consumer_descriptor = capp_channel.descriptor
+ back_consumer_descriptor = back_channel.descriptor
+
+ # create some consumers to receive messages
+ wmgr_consumer = EventConsumer(
+ wmgr_channel,
+ backbone,
+ filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN],
+ )
+ capp_consumer = EventConsumer(
+ capp_channel,
+ backbone,
+ )
+ back_consumer = EventConsumer(
+ back_channel,
+ backbone,
+ filters=[OnCreateConsumer.CONSUMER_CREATED],
+ )
+
+ # create some broadcasters to publish messages
+ mock_worker_mgr = EventBroadcaster(
+ backbone,
+ channel_factory=FileSystemCommChannel.from_descriptor,
+ )
+ mock_client_app = EventBroadcaster(
+ backbone,
+ channel_factory=FileSystemCommChannel.from_descriptor,
+ )
+
+    # register all of the consumers, even though the OnCreateConsumer event
+    # should really trigger registration on its own; event processing is
+    # tested elsewhere
+ backbone.notification_channels = [
+ wmgr_consumer_descriptor,
+ capp_consumer_descriptor,
+ back_consumer_descriptor,
+ ]
+
+ # simulate worker manager sending a notification to backend that it's alive
+ event_1 = OnCreateConsumer(
+ "test_eventconsumer_eventpublisher_integration",
+ wmgr_consumer_descriptor,
+ filters=[],
+ )
+ mock_worker_mgr.send(event_1)
+
+ # simulate the app updating a model a few times
+ event_2 = OnWriteFeatureStore(
+ "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-1"
+ )
+ event_3 = OnWriteFeatureStore(
+ "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-2"
+ )
+ event_4 = OnWriteFeatureStore(
+ "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-1"
+ )
+
+ mock_client_app.send(event_2)
+ mock_client_app.send(event_3)
+ mock_client_app.send(event_4)
+
+    # worker manager should only get updates about feature store writes
+ wmgr_messages = wmgr_consumer.recv()
+ assert len(wmgr_messages) == 3
+
+ # the backend should only receive messages about consumer creation
+ back_messages = back_consumer.recv()
+ assert len(back_messages) == 1
+
+ # hypothetical app has no filters and will get all events
+ app_messages = capp_consumer.recv()
+ assert len(app_messages) == 4
+
+
+@pytest.mark.parametrize("invalid_timeout", [-100.0, -1.0, 0.0])
+def test_eventconsumer_batch_timeout(
+ invalid_timeout: float,
+ test_dir: str,
+) -> None:
+ """Verify that a consumer allows only positive, non-zero values for timeout
+ if it is supplied.
+
+ :param invalid_timeout: any invalid timeout that should fail validation
+ :param test_dir: pytest fixture automatically generating unique working
+ directories for individual test outputs
+ """
+ storage_path = pathlib.Path(test_dir) / "features"
+ storage_path.mkdir(parents=True, exist_ok=True)
+
+ mock_storage = {}
+ backbone = BackboneFeatureStore(mock_storage)
+
+ channel = FileSystemCommChannel(storage_path / "test-wmgr")
+
+ with pytest.raises(ValueError) as ex:
+        # attempt a receive with an invalid (non-positive) batch timeout
+ consumer = EventConsumer(
+ channel,
+ backbone,
+ filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN],
+ )
+ consumer.recv(batch_timeout=invalid_timeout)
+
+ assert "positive" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "wait_timeout, exp_wait_max",
+ [
+ # aggregate the 1+1+1 into 3 on remaining parameters
+ pytest.param(1, 1 + 1 + 1, id="1s wait, 3 cycle steps"),
+ pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"),
+ pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"),
+ pytest.param(9, 3 + 2 + 4 + 8, id="9s wait, 6 cycle steps"),
+ # aggregate an entire cycle into 16
+ pytest.param(19.5, 16 + 3 + 2 + 4, id="20s wait, repeat cycle"),
+ ],
+)
+def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None:
+ """Verify that attempts to attach to the worker queue from the protoclient
+ timeout in an appropriate amount of time. Note: due to the backoff, we verify
+ the elapsed time is less than the 15s of a cycle of waits.
+
+ :param wait_timeout: Maximum amount of time (in seconds) to allow the backbone
+ to wait for the requested value to exist
+ :param exp_wait_max: Maximum amount of time (in seconds) to set as the upper
+ bound to allow the delays with backoff to occur
+ :param storage_for_dragon_fs: the dragon storage engine to use
+ """
+
+    # NOTE: exp_wait_max maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8]
+    # with leeway added (roughly 1s per step for the smaller backoff values)
+ start_time = time.time()
+
+ storage = {}
+ backbone = BackboneFeatureStore(storage)
+
+ with pytest.raises(SmartSimError) as ex:
+ backbone.wait_for(["does-not-exist"], wait_timeout)
+
+ assert "timeout" in str(ex.value.args[0]).lower()
+
+ end_time = time.time()
+ elapsed = end_time - start_time
+
+ # confirm that we met our timeout
+ assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}"
+
+ # confirm that the total wait time is aligned with the sleep cycle
+ assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}"
diff --git a/tests/dragon_wlm/test_featurestore_integration.py b/tests/dragon_wlm/test_featurestore_integration.py
new file mode 100644
index 0000000000..23fdc55ab6
--- /dev/null
+++ b/tests/dragon_wlm/test_featurestore_integration.py
@@ -0,0 +1,213 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_util import (
+ DEFAULT_CHANNEL_BUFFER_SIZE,
+ create_local,
+)
+from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster
+from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer
+from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+
+# isort: off
+from dragon.channels import Channel
+
+# isort: on
+
+if t.TYPE_CHECKING:
+ import conftest
+
+
+# The tests in this file must run in a dragon environment
+pytestmark = pytest.mark.dragon
+
+
+@pytest.fixture(scope="module")
+def the_worker_channel() -> DragonCommChannel:
+ """Fixture to create a valid descriptor for a worker channel
+ that can be attached to."""
+ wmgr_channel_ = create_local()
+ wmgr_channel = DragonCommChannel(wmgr_channel_)
+ return wmgr_channel
+
+
+@pytest.mark.parametrize(
+ "num_events, batch_timeout, max_batches_expected",
+ [
+ pytest.param(1, 1.0, 2, id="under 1s timeout"),
+ pytest.param(20, 1.0, 3, id="test 1s timeout 20x"),
+ pytest.param(30, 0.2, 5, id="test 0.2s timeout 30x"),
+ pytest.param(60, 0.4, 4, id="small batches"),
+ pytest.param(100, 0.1, 10, id="many small batches"),
+ ],
+)
+def test_eventconsumer_max_dequeue(
+ num_events: int,
+ batch_timeout: float,
+ max_batches_expected: int,
+ the_worker_channel: DragonCommChannel,
+ the_backbone: BackboneFeatureStore,
+) -> None:
+ """Verify that a consumer does not sit and collect messages indefinitely
+ by checking that a consumer returns after a maximum timeout is exceeded.
+
+ :param num_events: Total number of events to raise in the test
+    :param batch_timeout: Maximum time (in seconds) to spend collecting a single
+    batch of messages
+    :param max_batches_expected: Maximum number of receives that should occur
+    :param the_worker_channel: Fixture providing the channel the consumer listens on
+    :param the_backbone: Fixture providing the backbone feature store to use
+ """
+
+ # create some consumers to receive messages
+ wmgr_consumer = EventConsumer(
+ the_worker_channel,
+ the_backbone,
+ filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN],
+ )
+
+ # create a broadcaster to publish messages
+ mock_client_app = EventBroadcaster(
+ the_backbone,
+ channel_factory=DragonCommChannel.from_descriptor,
+ )
+
+    # register all of the consumers manually; an OnCreateConsumer event would
+    # normally trigger registration, but event processing is tested elsewhere
+ the_backbone.notification_channels = [the_worker_channel.descriptor]
+
+ # simulate the app updating a model a lot of times
+ for key in (f"key-{i}" for i in range(num_events)):
+ event = OnWriteFeatureStore(
+ "test_eventconsumer_max_dequeue", the_backbone.descriptor, key
+ )
+ mock_client_app.send(event, timeout=0.01)
+
+ num_dequeued = 0
+ num_batches = 0
+
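+    # drain the consumer in batches; each recv call returns the messages
+    # collected within a single batch_timeout window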
+ while wmgr_messages := wmgr_consumer.recv(
+ timeout=0.1,
+ batch_timeout=batch_timeout,
+ ):
+        # track total events dequeued and the number of recv batches required
+ num_dequeued += len(wmgr_messages)
+ num_batches += 1
+
+ # make sure we made all the expected dequeue calls and got everything
+ assert num_dequeued == num_events
+ assert num_batches > 0
+ assert num_batches < max_batches_expected, "too many recv calls were made"
+
+
+@pytest.mark.parametrize(
+ "buffer_size",
+ [
+ pytest.param(
+ -1,
+ id="replace negative, default to 500",
+ marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+ ),
+ pytest.param(
+ 0,
+ id="replace zero, default to 500",
+ marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+ ),
+ pytest.param(
+ 1,
+ id="non-zero buffer size: 1",
+ marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+ ),
+ # pytest.param(500, id="maximum size edge case: 500"),
+ pytest.param(
+ 550,
+ id="larger than default: 550",
+ marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+ ),
+ pytest.param(
+ 800,
+ id="much larger then default: 800",
+ marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+ ),
+ pytest.param(
+ 1000,
+ id="very large buffer: 1000, unreliable in dragon-v0.10",
+ marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+ ),
+ ],
+)
+def test_channel_buffer_size(
+ buffer_size: int,
+ the_storage: t.Any,
+) -> None:
+ """Verify that a channel used by an EventBroadcaster can buffer messages
+ until a configured maximum value is exceeded.
+
+ :param buffer_size: Maximum number of messages allowed in a channel buffer
+ :param the_storage: The dragon storage engine to use
+ """
+
+ mock_storage = the_storage
+ backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+
+ wmgr_channel_ = create_local(buffer_size) # <--- vary buffer size
+ wmgr_channel = DragonCommChannel(wmgr_channel_)
+ wmgr_consumer_descriptor = wmgr_channel.descriptor
+
+ # create a broadcaster to publish messages. create no consumers to
+ # push the number of sent messages past the allotted buffer size
+ mock_client_app = EventBroadcaster(
+ backbone,
+ channel_factory=DragonCommChannel.from_descriptor,
+ )
+
+    # register all of the consumers manually; an OnCreateConsumer event would
+    # normally trigger registration, but event processing is tested elsewhere
+ backbone.notification_channels = [wmgr_consumer_descriptor]
+
+ if buffer_size < 1:
+ # NOTE: we set this after creating the channel above to ensure
+ # the default parameter value was used during instantiation
+ buffer_size = DEFAULT_CHANNEL_BUFFER_SIZE
+
+ # simulate the app updating a model a lot of times
+ for key in (f"key-{i}" for i in range(buffer_size)):
+ event = OnWriteFeatureStore(
+ "test_channel_buffer_size", backbone.descriptor, key
+ )
+ mock_client_app.send(event, timeout=0.01)
+
+    # sending 1 more message past the configured buffer size should raise an error
+ with pytest.raises(Exception) as ex:
+ mock_client_app.send(event, timeout=0.01)
diff --git a/tests/dragon_wlm/test_inference_reply.py b/tests/dragon_wlm/test_inference_reply.py
new file mode 100644
index 0000000000..bdc7be14bc
--- /dev/null
+++ b/tests/dragon_wlm/test_inference_reply.py
@@ -0,0 +1,76 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.infrastructure.storage.feature_store import TensorKey
+from smartsim._core.mli.infrastructure.worker.worker import InferenceReply
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+handler = MessageHandler()
+
+
+@pytest.fixture
+def inference_reply() -> InferenceReply:
+ return InferenceReply()
+
+
+@pytest.fixture
+def fs_key() -> TensorKey:
+ return TensorKey("key", "descriptor")
+
+
+@pytest.mark.parametrize(
+ "outputs, expected",
+ [
+ ([b"output bytes"], True),
+ (None, False),
+ ([], False),
+ ],
+)
+def test_has_outputs(monkeypatch, inference_reply, outputs, expected):
+ """Test the has_outputs property with different values for outputs."""
+ monkeypatch.setattr(inference_reply, "outputs", outputs)
+ assert inference_reply.has_outputs == expected
+
+
+@pytest.mark.parametrize(
+ "output_keys, expected",
+ [
+ ([fs_key], True),
+ (None, False),
+ ([], False),
+ ],
+)
+def test_has_output_keys(monkeypatch, inference_reply, output_keys, expected):
+ """Test the has_output_keys property with different values for output_keys."""
+ monkeypatch.setattr(inference_reply, "output_keys", output_keys)
+ assert inference_reply.has_output_keys == expected
diff --git a/tests/dragon_wlm/test_inference_request.py b/tests/dragon_wlm/test_inference_request.py
new file mode 100644
index 0000000000..f5c8b9bdc7
--- /dev/null
+++ b/tests/dragon_wlm/test_inference_request.py
@@ -0,0 +1,118 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.infrastructure.storage.feature_store import TensorKey
+from smartsim._core.mli.infrastructure.worker.worker import InferenceRequest
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+handler = MessageHandler()
+
+
+@pytest.fixture
+def inference_request() -> InferenceRequest:
+ return InferenceRequest()
+
+
+@pytest.fixture
+def fs_key() -> TensorKey:
+ return TensorKey("key", "descriptor")
+
+
+@pytest.mark.parametrize(
+ "raw_model, expected",
+ [
+ (handler.build_model(b"bytes", "Model Name", "V1"), True),
+ (None, False),
+ ],
+)
+def test_has_raw_model(monkeypatch, inference_request, raw_model, expected):
+ """Test the has_raw_model property with different values for raw_model."""
+ monkeypatch.setattr(inference_request, "raw_model", raw_model)
+ assert inference_request.has_raw_model == expected
+
+
+@pytest.mark.parametrize(
+ "model_key, expected",
+ [
+ (fs_key, True),
+ (None, False),
+ ],
+)
+def test_has_model_key(monkeypatch, inference_request, model_key, expected):
+ """Test the has_model_key property with different values for model_key."""
+ monkeypatch.setattr(inference_request, "model_key", model_key)
+ assert inference_request.has_model_key == expected
+
+
+@pytest.mark.parametrize(
+ "raw_inputs, expected",
+ [([b"raw input bytes"], True), (None, False), ([], False)],
+)
+def test_has_raw_inputs(monkeypatch, inference_request, raw_inputs, expected):
+ """Test the has_raw_inputs property with different values for raw_inputs."""
+ monkeypatch.setattr(inference_request, "raw_inputs", raw_inputs)
+ assert inference_request.has_raw_inputs == expected
+
+
+@pytest.mark.parametrize(
+ "input_keys, expected",
+ [([fs_key], True), (None, False), ([], False)],
+)
+def test_has_input_keys(monkeypatch, inference_request, input_keys, expected):
+ """Test the has_input_keys property with different values for input_keys."""
+ monkeypatch.setattr(inference_request, "input_keys", input_keys)
+ assert inference_request.has_input_keys == expected
+
+
+@pytest.mark.parametrize(
+ "output_keys, expected",
+ [([fs_key], True), (None, False), ([], False)],
+)
+def test_has_output_keys(monkeypatch, inference_request, output_keys, expected):
+ """Test the has_output_keys property with different values for output_keys."""
+ monkeypatch.setattr(inference_request, "output_keys", output_keys)
+ assert inference_request.has_output_keys == expected
+
+
+@pytest.mark.parametrize(
+ "input_meta, expected",
+ [
+ ([handler.build_tensor_descriptor("c", "float32", [1, 2, 3])], True),
+ (None, False),
+ ([], False),
+ ],
+)
+def test_has_input_meta(monkeypatch, inference_request, input_meta, expected):
+ """Test the has_input_meta property with different values for input_meta."""
+ monkeypatch.setattr(inference_request, "input_meta", input_meta)
+ assert inference_request.has_input_meta == expected
diff --git a/tests/dragon_wlm/test_protoclient.py b/tests/dragon_wlm/test_protoclient.py
new file mode 100644
index 0000000000..f84417107d
--- /dev/null
+++ b/tests/dragon_wlm/test_protoclient.py
@@ -0,0 +1,313 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import pickle
+import time
+import typing as t
+from unittest.mock import MagicMock
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster
+from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+# isort: off
+from dragon import fli
+from dragon.data.ddict.ddict import DDict
+
+# from ..ex..high_throughput_inference.mock_app import ProtoClient
+from smartsim._core.mli.client.protoclient import ProtoClient
+
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+WORK_QUEUE_KEY = BackboneFeatureStore.MLI_WORKER_QUEUE
+logger = get_logger(__name__)
+
+
+@pytest.fixture(scope="module")
+def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel:
+ """Fixture that creates a dragon FLI channel as a stand-in for the
+ worker queue created by the worker.
+
+ :param the_backbone: The backbone feature store to update
+ with the worker queue descriptor.
+ :returns: The attached `DragonFLIChannel`
+ """
+
+ # create the FLI
+ to_worker_channel = create_local()
+ fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None)
+ comm_channel = DragonFLIChannel(fli_)
+
+ # store the descriptor in the backbone
+ the_backbone.worker_queue = comm_channel.descriptor
+
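+    # smoke-test the channel; a failure here is logged but does not fail the fixture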
+ try:
+ comm_channel.send(b"foo")
+ except Exception as ex:
+        logger.exception("Test send from worker channel failed")
+
+ return comm_channel
+
+
+@pytest.mark.parametrize(
+ "backbone_timeout, exp_wait_max",
+ [
+        # the initial 1+1+1 allowance is aggregated into 3 for the remaining parameters
+ pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"),
+ pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"),
+ pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"),
+ ],
+)
+def test_protoclient_timeout(
+ backbone_timeout: float,
+ exp_wait_max: float,
+ the_backbone: BackboneFeatureStore,
+ monkeypatch: pytest.MonkeyPatch,
+):
+ """Verify that attempts to attach to the worker queue from the protoclient
+ timeout in an appropriate amount of time. Note: due to the backoff, we verify
+ the elapsed time is less than the 15s of a cycle of waits.
+
+ :param backbone_timeout: a timeout for use when configuring a proto client
+ :param exp_wait_max: a ceiling for the expected time spent waiting for
+ the timeout
+ :param the_backbone: a pre-initialized backbone featurestore for setting up
+ the environment variable required by the client
+ """
+
+    # NOTE: exp_wait_max maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8]
+    # with leeway added (roughly 1s per step for the smaller backoff values)
+
+ with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex:
+ start_time = time.time()
+ # remove the worker queue value from the backbone if it exists
+ # to ensure the timeout occurs
+ the_backbone.pop(BackboneFeatureStore.MLI_WORKER_QUEUE)
+
+ ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
+
+ ProtoClient(timing_on=False, backbone_timeout=backbone_timeout)
+ elapsed = time.time() - start_time
+ logger.info(f"ProtoClient timeout occurred in {elapsed} seconds")
+
+ # confirm that we met our timeout
+ assert (
+ elapsed >= backbone_timeout
+ ), f"below configured timeout {backbone_timeout}"
+
+ # confirm that the total wait time is aligned with the sleep cycle
+ assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}"
+
+
+def test_protoclient_initialization_no_backbone(
+ monkeypatch: pytest.MonkeyPatch, the_worker_queue: DragonFLIChannel
+):
+ """Verify that attempting to start the client without required environment variables
+ results in an exception.
+
+    :param the_worker_queue: The worker queue fixture, included to ensure
+    the worker queue environment is correctly configured.
+
+ NOTE: os.environ[BackboneFeatureStore.MLI_BACKBONE] is not set"""
+
+ with monkeypatch.context() as patch, pytest.raises(SmartSimError) as ex:
+ patch.setenv(BackboneFeatureStore.MLI_BACKBONE, "")
+
+ ProtoClient(timing_on=False)
+
+ # confirm the missing value error has been raised
+ assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" ")))
+
+
+def test_protoclient_initialization(
+ the_backbone: BackboneFeatureStore,
+ the_worker_queue: DragonFLIChannel,
+ monkeypatch: pytest.MonkeyPatch,
+):
+ """Verify that attempting to start the client with required env vars results
+ in a fully initialized client.
+
+ :param the_backbone: a pre-initialized backbone featurestore
+ :param the_worker_queue: an FLI channel the client will retrieve
+ from the backbone"""
+
+ with monkeypatch.context() as ctx:
+ ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
+ # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone
+
+ client = ProtoClient(timing_on=False)
+
+ fs_descriptor = the_backbone.descriptor
+ wq_descriptor = the_worker_queue.descriptor
+
+ # confirm the backbone was attached correctly
+ assert client._backbone is not None
+ assert client._backbone.descriptor == fs_descriptor
+
+ # we expect the backbone to add its descriptor to the local env
+ assert os.environ[BackboneFeatureStore.MLI_BACKBONE] == fs_descriptor
+
+ # confirm the worker queue is created and attached correctly
+ assert client._to_worker_fli is not None
+ assert client._to_worker_fli.descriptor == wq_descriptor
+
+ # we expect the worker queue descriptor to be placed into the backbone
+ # we do NOT expect _from_worker_ch to be placed anywhere. it's a specific callback
+ assert the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] == wq_descriptor
+
+ # confirm the worker channels are created
+ assert client._from_worker_ch is not None
+ assert client._to_worker_ch is not None
+
+    # wrap the channels to verify that each produces a descriptor
+ assert DragonCommChannel(client._from_worker_ch).descriptor
+ assert DragonCommChannel(client._to_worker_ch).descriptor
+
+ # confirm a publisher is created
+ assert client._publisher is not None
+
+
+def test_protoclient_write_model(
+ the_backbone: BackboneFeatureStore,
+ the_worker_queue: DragonFLIChannel,
+ monkeypatch: pytest.MonkeyPatch,
+):
+ """Verify that writing a model using the client causes the model data to be
+ written to a feature store.
+
+ :param the_backbone: a pre-initialized backbone featurestore
+    :param the_worker_queue: The worker queue fixture, included to ensure
+    the worker queue environment is correctly configured
+ """
+
+ with monkeypatch.context() as ctx:
+ ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
+ # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone
+
+ client = ProtoClient(timing_on=False)
+
+ model_key = "my-model"
+ model_bytes = b"12345"
+
+ client.set_model(model_key, model_bytes)
+
+ # confirm the client modified the underlying feature store
+ assert client._backbone[model_key] == model_bytes
+
+
+@pytest.mark.parametrize(
+ "num_listeners, num_model_updates",
+ [(1, 1), (1, 4), (2, 4), (16, 4), (64, 8)],
+)
+def test_protoclient_write_model_notification_sent(
+ the_backbone: BackboneFeatureStore,
+ the_worker_queue: DragonFLIChannel,
+ monkeypatch: pytest.MonkeyPatch,
+ num_listeners: int,
+ num_model_updates: int,
+):
+ """Verify that writing a model sends a key-written event.
+
+ :param the_backbone: a pre-initialized backbone featurestore
+ :param the_worker_queue: an FLI channel the client will retrieve
+ from the backbone
+ :param num_listeners: vary the number of registered listeners
+ to verify that the event is broadcast to everyone
+    :param num_model_updates: vary the number of model updates written
+    to verify the broadcast counts messages sent correctly
+ """
+
+    # sends are mocked below, but the broadcaster will not attempt a send
+    # unless listeners are registered, so provide mock listener descriptors
+ listeners = [f"mock-ch-desc-{i}" for i in range(num_listeners)]
+
+ the_backbone[BackboneFeatureStore.MLI_BACKBONE] = the_backbone.descriptor
+ the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_queue.descriptor
+ the_backbone[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS] = ",".join(listeners)
+ the_backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = None
+
+ with monkeypatch.context() as ctx:
+ ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
+ # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone
+
+ client = ProtoClient(timing_on=False)
+
+ publisher = t.cast(EventBroadcaster, client._publisher)
+
+ # mock attaching to a channel given the mock-ch-desc in backbone
+ mock_send = MagicMock(return_value=None)
+ mock_comm_channel = MagicMock(**{"send": mock_send}, spec=DragonCommChannel)
+ mock_get_comm_channel = MagicMock(return_value=mock_comm_channel)
+ ctx.setattr(publisher, "_get_comm_channel", mock_get_comm_channel)
+
+ model_key = "my-model"
+ model_bytes = b"12345"
+
+ for i in range(num_model_updates):
+ client.set_model(model_key, model_bytes)
+
+ # confirm that a listener channel was attached
+ # once for each registered listener in backbone
+ assert mock_get_comm_channel.call_count == num_listeners * num_model_updates
+
+ # confirm the client raised the key-written event
+ assert (
+ mock_send.call_count == num_listeners * num_model_updates
+ ), f"Expected {num_listeners} sends with {num_listeners} registrations"
+
+ # with at least 1 consumer registered, we can verify the message is sent
+ for call_args in mock_send.call_args_list:
+ send_args = call_args.args
+ event_bytes, timeout = send_args[0], send_args[1]
+
+ assert event_bytes, "Expected event bytes to be supplied to send"
+ assert (
+ timeout == 0.001
+ ), "Expected default timeout on call to `publisher.send`, "
+
+ # confirm the correct event was raised
+ event = t.cast(
+ OnWriteFeatureStore,
+ pickle.loads(event_bytes),
+ )
+ assert event.descriptor == the_backbone.descriptor
+ assert event.key == model_key
diff --git a/tests/dragon_wlm/test_reply_building.py b/tests/dragon_wlm/test_reply_building.py
new file mode 100644
index 0000000000..1b0074ca0e
--- /dev/null
+++ b/tests/dragon_wlm/test_reply_building.py
@@ -0,0 +1,88 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.infrastructure.control.worker_manager import build_failure_reply
+
+if t.TYPE_CHECKING:
+ from smartsim._core.mli.mli_schemas.response.response_capnp import Status
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+@pytest.mark.parametrize(
+ "status, message",
+ [
+ pytest.param("timeout", "Worker timed out", id="timeout"),
+ pytest.param("fail", "Failed while executing", id="fail"),
+ ],
+)
+def test_build_failure_reply(status: "Status", message: str):
+ "Ensures failure replies can be built successfully"
+ response = build_failure_reply(status, message)
+ display_name = response.schema.node.displayName # type: ignore
+ class_name = display_name.split(":")[-1]
+ assert class_name == "Response"
+ assert response.status == status
+ assert response.message == message
+
+
+def test_build_failure_reply_fails():
+ "Ensures ValueError is raised if a Status Enum is not used"
+ with pytest.raises(ValueError) as ex:
+ response = build_failure_reply("not a status enum", "message")
+
+ assert "Error assigning status to response" in ex.value.args[0]
diff --git a/tests/dragon_wlm/test_request_dispatcher.py b/tests/dragon_wlm/test_request_dispatcher.py
new file mode 100644
index 0000000000..8dc0f67a31
--- /dev/null
+++ b/tests/dragon_wlm/test_request_dispatcher.py
@@ -0,0 +1,237 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import gc
+import os
+import time
+import typing as t
+from queue import Empty
+
+import numpy as np
+import pytest
+
+pytest.importorskip("dragon")
+
+
+# isort: off
+import dragon
+
+from dragon.fli import FLInterface
+from dragon.data.ddict.ddict import DDict
+from dragon.managed_memory import MemoryAlloc
+
+import multiprocessing as mp
+
+import torch
+
+# isort: on
+
+
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.control.dragon_util import (
+ function_as_dragon_proc,
+)
+from smartsim._core.mli.infrastructure.control.request_dispatcher import (
+ RequestBatch,
+ RequestDispatcher,
+)
+from smartsim._core.mli.infrastructure.control.worker_manager import (
+ EnvironmentConfigLoader,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
+ DragonFeatureStore,
+)
+from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker
+from smartsim.log import get_logger
+
+from .utils.msg_pump import mock_messages
+
+logger = get_logger(__name__)
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+try:
+ mp.set_start_method("dragon")
+except Exception:
+ pass
+
+
+@pytest.mark.skip("TODO: Fix issue unpickling messages")
+@pytest.mark.parametrize("num_iterations", [4])
+def test_request_dispatcher(
+ num_iterations: int,
+ the_storage: DDict,
+ test_dir: str,
+) -> None:
+ """Test the request dispatcher batching and queueing system
+
+ This also includes setting a queue to disposable, checking that it is no
+ longer referenced by the dispatcher.
+ """
+
+ to_worker_channel = create_local()
+ to_worker_fli = FLInterface(main_ch=to_worker_channel, manager_ch=None)
+ to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli)
+
+ backbone_fs = BackboneFeatureStore(the_storage, allow_reserved_writes=True)
+
+ # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader
+ # or test environment may be unable to send messages w/queue
+ os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = to_worker_fli_comm_ch.descriptor
+ os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone_fs.descriptor
+
+ config_loader = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=DragonCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+
+ request_dispatcher = RequestDispatcher(
+ batch_timeout=1000,
+ batch_size=2,
+ config_loader=config_loader,
+ worker_type=TorchWorker,
+ mem_pool_size=2 * 1024**2,
+ )
+
+ worker_queue = config_loader.get_queue()
+ if worker_queue is None:
+ logger.warning(
+ "FLI input queue not loaded correctly from config_loader: "
+ f"{config_loader._queue_descriptor}"
+ )
+
+ request_dispatcher._on_start()
+
+ # put some messages into the work queue for the dispatcher to pickup
+ channels = []
+ processes = []
+ for i in range(num_iterations):
+ batch: t.Optional[RequestBatch] = None
+ mem_allocs = []
+ tensors = []
+
+        # NOTE: create callbacks in the test to avoid a local channel being torn
+        # down when mock_messages terminates before the final response message is sent
+
+ callback_channel = DragonCommChannel.from_local()
+ channels.append(callback_channel)
+
+ process = function_as_dragon_proc(
+ mock_messages,
+ [
+ worker_queue.descriptor,
+ backbone_fs.descriptor,
+ i,
+ callback_channel.descriptor,
+ ],
+ [],
+ [],
+ )
+ processes.append(process)
+ process.start()
+ assert process.returncode is None, "The message pump failed to start"
+
+ # give dragon some time to populate the message queues
+ for i in range(15):
+ try:
+ request_dispatcher._on_iteration()
+ batch = request_dispatcher.task_queue.get(timeout=1.0)
+ break
+ except Empty:
+ time.sleep(2)
+ logger.warning(f"Task queue is empty on iteration {i}")
+ continue
+ except Exception as exc:
+ logger.error(f"Task queue exception on iteration {i}")
+ raise exc
+
+ assert batch is not None
+ assert batch.has_valid_requests
+
+ model_key = batch.model_id.key
+
+ try:
+ transform_result = batch.inputs
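+        # each transformed input is a dragon managed-memory allocation; attach
+        # to it and reinterpret the raw buffer to rebuild the torch tensor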
+ for transformed, dims, dtype in zip(
+ transform_result.transformed,
+ transform_result.dims,
+ transform_result.dtypes,
+ ):
+ mem_alloc = MemoryAlloc.attach(transformed)
+ mem_allocs.append(mem_alloc)
+ itemsize = np.empty((1), dtype=dtype).itemsize
+ tensors.append(
+ torch.from_numpy(
+ np.frombuffer(
+ mem_alloc.get_memview()[0 : np.prod(dims) * itemsize],
+ dtype=dtype,
+ ).reshape(dims)
+ )
+ )
+
+ assert len(batch.requests) == 2
+ assert batch.model_id.key == model_key
+ assert model_key in request_dispatcher._queues
+ assert model_key in request_dispatcher._active_queues
+ assert len(request_dispatcher._queues[model_key]) == 1
+ assert request_dispatcher._queues[model_key][0].empty()
+ assert request_dispatcher._queues[model_key][0].model_id.key == model_key
+ assert len(tensors) == 1
+ assert tensors[0].shape == torch.Size([2, 2])
+
+ for tensor in tensors:
+ for sample_idx in range(tensor.shape[0]):
+ tensor_in = tensor[sample_idx]
+ tensor_out = (sample_idx + 1) * torch.ones(
+ (2,), dtype=torch.float32
+ )
+ assert torch.equal(tensor_in, tensor_out)
+
+ except Exception as exc:
+ raise exc
+ finally:
+ for mem_alloc in mem_allocs:
+ mem_alloc.free()
+
+ request_dispatcher._active_queues[model_key].make_disposable()
+ assert request_dispatcher._active_queues[model_key].can_be_removed
+
+ request_dispatcher._on_iteration()
+
+ assert model_key not in request_dispatcher._active_queues
+ assert model_key not in request_dispatcher._queues
+
+ # Try to remove the dispatcher and free the memory
+ del request_dispatcher
+ gc.collect()
diff --git a/tests/dragon_wlm/test_torch_worker.py b/tests/dragon_wlm/test_torch_worker.py
new file mode 100644
index 0000000000..2a9e7d01bd
--- /dev/null
+++ b/tests/dragon_wlm/test_torch_worker.py
@@ -0,0 +1,221 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import io
+import typing as t
+
+import numpy as np
+import pytest
+import torch
+
+dragon = pytest.importorskip("dragon")
+import dragon.globalservices.pool as dragon_gs_pool
+from dragon.managed_memory import MemoryAlloc, MemoryPool
+from torch import nn
+from torch.nn import functional as F
+
+from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey
+from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker
+from smartsim._core.mli.infrastructure.worker.worker import (
+ ExecuteResult,
+ FetchInputResult,
+ FetchModelResult,
+ InferenceRequest,
+ LoadModelResult,
+ RequestBatch,
+ TransformInputResult,
+)
+from smartsim._core.mli.message_handler import MessageHandler
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+# simple MNIST in PyTorch
+class Net(nn.Module):
+ def __init__(self):
+ super(Net, self).__init__()
+ self.conv1 = nn.Conv2d(1, 32, 3, 1)
+ self.conv2 = nn.Conv2d(32, 64, 3, 1)
+ self.dropout1 = nn.Dropout(0.25)
+ self.dropout2 = nn.Dropout(0.5)
+ self.fc1 = nn.Linear(9216, 128)
+ self.fc2 = nn.Linear(128, 10)
+
+ def forward(self, x, y):
+ x = self.conv1(x)
+ x = F.relu(x)
+ x = self.conv2(x)
+ x = F.relu(x)
+ x = F.max_pool2d(x, 2)
+ x = self.dropout1(x)
+ x = torch.flatten(x, 1)
+ x = self.fc1(x)
+ x = F.relu(x)
+ x = self.dropout2(x)
+ x = self.fc2(x)
+ output = F.log_softmax(x, dim=1)
+ return output
+
+
+torch_device = {"cpu": "cpu", "gpu": "cuda"}
+
+
+def get_batch() -> torch.Tensor:
+ return torch.rand(20, 1, 28, 28)
+
+
+def create_torch_model():
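+    # trace the network with example inputs and serialize the TorchScript module to bytes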
+ n = Net()
+ example_forward_input = get_batch()
+ module = torch.jit.trace(n, [example_forward_input, example_forward_input])
+ model_buffer = io.BytesIO()
+ torch.jit.save(module, model_buffer)
+ return model_buffer.getvalue()
+
+
+def get_request() -> InferenceRequest:
+
+ tensors = [get_batch() for _ in range(2)]
+ tensor_numpy = [tensor.numpy() for tensor in tensors]
+ serialized_tensors_descriptors = [
+ MessageHandler.build_tensor_descriptor("c", "float32", list(tensor.shape))
+ for tensor in tensors
+ ]
+
+ return InferenceRequest(
+ model_key=ModelKey(key="model", descriptor="xyz"),
+ callback=None,
+ raw_inputs=tensor_numpy,
+ input_keys=None,
+ input_meta=serialized_tensors_descriptors,
+ output_keys=None,
+ raw_model=create_torch_model(),
+ batch_size=0,
+ )
+
+
+def get_request_batch_from_request(
+ request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None
+) -> RequestBatch:
+
+ return RequestBatch([request], inputs, request.model_key)
+
+
+sample_request: InferenceRequest = get_request()
+sample_request_batch: RequestBatch = get_request_batch_from_request(sample_request)
+worker = TorchWorker()
+
+
+def test_load_model(mlutils) -> None:
+ fetch_model_result = FetchModelResult(sample_request.raw_model)
+ load_model_result = worker.load_model(
+ sample_request_batch, fetch_model_result, mlutils.get_test_device().lower()
+ )
+
+ assert load_model_result.model(
+ get_batch().to(torch_device[mlutils.get_test_device().lower()]),
+ get_batch().to(torch_device[mlutils.get_test_device().lower()]),
+ ).shape == torch.Size((20, 10))
+
+
+def test_transform_input(mlutils) -> None:
+ fetch_input_result = FetchInputResult(
+ sample_request.raw_inputs, sample_request.input_meta
+ )
+
+ mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc)
+
+ transform_input_result = worker.transform_input(
+ sample_request_batch, [fetch_input_result], mem_pool
+ )
+
+ batch = get_batch().numpy()
+ assert transform_input_result.slices[0] == slice(0, batch.shape[0])
+
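+    # verify each input round-trips through the dragon memory pool unchanged by
+    # attaching to the allocation and rebuilding the tensor from the raw buffer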
+ for tensor_index in range(2):
+ assert torch.Size(transform_input_result.dims[tensor_index]) == batch.shape
+ assert transform_input_result.dtypes[tensor_index] == str(batch.dtype)
+ mem_alloc = MemoryAlloc.attach(transform_input_result.transformed[tensor_index])
+ itemsize = batch.itemsize
+ tensor = torch.from_numpy(
+ np.frombuffer(
+ mem_alloc.get_memview()[
+ 0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize
+ ],
+ dtype=transform_input_result.dtypes[tensor_index],
+ ).reshape(transform_input_result.dims[tensor_index])
+ )
+
+ assert torch.equal(
+ tensor, torch.from_numpy(sample_request.raw_inputs[tensor_index])
+ )
+
+ mem_pool.destroy()
+
+
+def test_execute(mlutils) -> None:
+ load_model_result = LoadModelResult(
+ Net().to(torch_device[mlutils.get_test_device().lower()])
+ )
+ fetch_input_result = FetchInputResult(
+ sample_request.raw_inputs, sample_request.input_meta
+ )
+
+ request_batch = get_request_batch_from_request(sample_request, fetch_input_result)
+
+ mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc)
+
+ transform_result = worker.transform_input(
+ request_batch, [fetch_input_result], mem_pool
+ )
+
+ execute_result = worker.execute(
+ request_batch,
+ load_model_result,
+ transform_result,
+ mlutils.get_test_device().lower(),
+ )
+
+ assert all(
+ result.shape == torch.Size((20, 10)) for result in execute_result.predictions
+ )
+
+ mem_pool.destroy()
+
+
+def test_transform_output(mlutils):
+ tensors = [torch.rand((20, 10)) for _ in range(2)]
+ execute_result = ExecuteResult(tensors, [slice(0, 20)])
+
+ transformed_output = worker.transform_output(sample_request_batch, execute_result)
+
+ assert transformed_output[0].outputs == [item.numpy().tobytes() for item in tensors]
+    assert transformed_output[0].shape is None
+ assert transformed_output[0].order == "c"
+ assert transformed_output[0].dtype == "float32"
diff --git a/tests/dragon_wlm/test_worker_manager.py b/tests/dragon_wlm/test_worker_manager.py
new file mode 100644
index 0000000000..20370bea7e
--- /dev/null
+++ b/tests/dragon_wlm/test_worker_manager.py
@@ -0,0 +1,313 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import io
+import logging
+import pathlib
+import time
+
+import pytest
+
+torch = pytest.importorskip("torch")
+dragon = pytest.importorskip("dragon")
+
+import multiprocessing as mp
+
+try:
+ mp.set_start_method("dragon")
+except Exception:
+ pass
+
+import os
+
+import torch.nn as nn
+from dragon import fli
+
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.control.worker_manager import (
+ EnvironmentConfigLoader,
+ WorkerManager,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
+ DragonFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
+from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker
+from smartsim._core.mli.message_handler import MessageHandler
+from smartsim.log import get_logger
+
+from .utils.channel import FileSystemCommChannel
+
+logger = get_logger(__name__)
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+class MiniModel(nn.Module):
+ """A torch model that can be executed by the default torch worker"""
+
+ def __init__(self):
+ """Initialize the model."""
+ super().__init__()
+
+ self._name = "mini-model"
+ self._net = torch.nn.Linear(2, 1)
+
+ def forward(self, input):
+ """Execute a forward pass."""
+ return self._net(input)
+
+ @property
+ def bytes(self) -> bytes:
+ """Retrieve the serialized model
+
+ :returns: The byte stream of the model file
+ """
+ buffer = io.BytesIO()
+ scripted = torch.jit.trace(self._net, self.get_batch())
+ torch.jit.save(scripted, buffer)
+ return buffer.getvalue()
+
+ @classmethod
+ def get_batch(cls) -> "torch.Tensor":
+ """Generate a single batch of data with the correct
+ shape for inference.
+
+ :returns: The batch as a torch tensor
+ """
+ return torch.randn((100, 2), dtype=torch.float32)
+
+
+def create_model(model_path: pathlib.Path) -> pathlib.Path:
+ """Create a simple torch model and persist to disk for
+ testing purposes.
+
+ :param model_path: The path to the torch model file
+ """
+ if not model_path.parent.exists():
+ model_path.parent.mkdir(parents=True, exist_ok=True)
+
+ model_path.unlink(missing_ok=True)
+
+ mini_model = MiniModel()
+ torch.save(mini_model, model_path)
+
+ return model_path
+
+
+def load_model() -> bytes:
+ """Create a simple torch model in memory for testing."""
+ mini_model = MiniModel()
+ return mini_model.bytes
+
+
+def mock_messages(
+ feature_store_root_dir: pathlib.Path,
+ comm_channel_root_dir: pathlib.Path,
+ kill_queue: mp.Queue,
+) -> None:
+ """Mock event producer for triggering the inference pipeline.
+
+ :param feature_store_root_dir: Path to a directory where a
+ FileSystemFeatureStore can read & write results
+ :param comm_channel_root_dir: Path to a directory where a
+ FileSystemCommChannel can read & write messages
+ :param kill_queue: Queue used by unit test to stop mock_message process
+ """
+ feature_store_root_dir.mkdir(parents=True, exist_ok=True)
+ comm_channel_root_dir.mkdir(parents=True, exist_ok=True)
+
+ iteration_number = 0
+
+ config_loader = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=FileSystemCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+ backbone = config_loader.get_backbone()
+
+ worker_queue = config_loader.get_queue()
+ if worker_queue is None:
+ queue_desc = config_loader._queue_descriptor
+        logger.warning(
+ f"FLI input queue not loaded correctly from config_loader: {queue_desc}"
+ )
+
+ model_key = "mini-model"
+ model_bytes = load_model()
+ backbone[model_key] = model_bytes
+
+ while True:
+ if not kill_queue.empty():
+ return
+ iteration_number += 1
+ time.sleep(1)
+
+ channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt"
+ callback_channel = FileSystemCommChannel(pathlib.Path(channel_key))
+
+ batch = MiniModel.get_batch()
+ shape = batch.shape
+ batch_bytes = batch.numpy().tobytes()
+
+ logger.debug(f"Model content: {backbone[model_key][:20]}")
+
+ input_descriptor = MessageHandler.build_tensor_descriptor(
+ "f", "float32", list(shape)
+ )
+
+ # The first request is always the metadata...
+ request = MessageHandler.build_request(
+ reply_channel=callback_channel.descriptor,
+ model=MessageHandler.build_model(model_bytes, "mini-model", "1.0"),
+ inputs=[input_descriptor],
+ outputs=[],
+ output_descriptors=[],
+ custom_attributes=None,
+ )
+ request_bytes = MessageHandler.serialize_request(request)
+        fli_channel: DragonFLIChannel = worker_queue
+
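+        # send the serialized request followed by the raw tensor bytes
+        # as a single multipart message on the worker queue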
+ multipart_message = [request_bytes, batch_bytes]
+        fli_channel.send_multiple(multipart_message)
+
+ logger.info("published message")
+
+ if iteration_number > 5:
+ return
+
+
+def mock_mli_infrastructure_mgr() -> None:
+ """Create resources normally instanatiated by the infrastructure
+ management portion of the DragonBackend.
+ """
+ config_loader = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=FileSystemCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+
+ integrated_worker = TorchWorker
+
+ worker_manager = WorkerManager(
+ config_loader,
+ integrated_worker,
+ as_service=True,
+ cooldown=10,
+ device="cpu",
+ dispatcher_queue=mp.Queue(maxsize=0),
+ )
+ worker_manager.execute()
+
+
+@pytest.fixture
+def prepare_environment(test_dir: str) -> pathlib.Path:
+ """Cleanup prior outputs to run demo repeatedly.
+
+ :param test_dir: the directory to prepare
+ :returns: The path to the log file
+ """
+ path = pathlib.Path(f"{test_dir}/workermanager.log")
+ logging.basicConfig(filename=path.absolute(), level=logging.DEBUG)
+ return path
+
+
+def test_worker_manager(prepare_environment: pathlib.Path) -> None:
+ """Test the worker manager.
+
+ :param prepare_environment: Pass this fixture to configure
+ global resources before the worker manager executes
+ """
+
+ test_path = prepare_environment
+ fs_path = test_path / "feature_store"
+ comm_path = test_path / "comm_store"
+
+ mgr_per_node = 1
+ num_nodes = 2
+ mem_per_node = 128 * 1024**2
+
+ storage = create_ddict(num_nodes, mgr_per_node, mem_per_node)
+ backbone = BackboneFeatureStore(storage, allow_reserved_writes=True)
+
+ to_worker_channel = create_local()
+ to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None)
+
+ to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli)
+
+ # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader
+ # or test environment may be unable to send messages w/queue
+ os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = (
+ to_worker_fli_comm_channel.descriptor
+ )
+ os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor
+
+ config_loader = EnvironmentConfigLoader(
+ featurestore_factory=DragonFeatureStore.from_descriptor,
+ callback_factory=FileSystemCommChannel.from_descriptor,
+ queue_factory=DragonFLIChannel.from_descriptor,
+ )
+ integrated_worker_type = TorchWorker
+
+ worker_manager = WorkerManager(
+ config_loader,
+ integrated_worker_type,
+ as_service=True,
+ cooldown=5,
+ device="cpu",
+ dispatcher_queue=mp.Queue(maxsize=0),
+ )
+
+ worker_queue = config_loader.get_queue()
+ if worker_queue is None:
+        logger.warning(
+ f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}"
+ )
+ backbone.worker_queue = to_worker_fli_comm_channel.descriptor
+
+ # create a mock client application to populate the request queue
+ kill_queue = mp.Queue()
+ msg_pump = mp.Process(
+ target=mock_messages,
+ args=(fs_path, comm_path, kill_queue),
+ )
+ msg_pump.start()
+
+ # create a process to execute commands
+ process = mp.Process(target=mock_mli_infrastructure_mgr)
+
+ # let it send some messages before starting the worker manager
+ msg_pump.join(timeout=5)
+ process.start()
+ msg_pump.join(timeout=5)
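+    # signal the message pump to stop, then give both processes a chance to exit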
+ kill_queue.put_nowait("kill!")
+ process.join(timeout=5)
+ msg_pump.kill()
+ process.kill()
diff --git a/tests/dragon_wlm/utils/__init__.py b/tests/dragon_wlm/utils/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/dragon_wlm/utils/channel.py b/tests/dragon_wlm/utils/channel.py
new file mode 100644
index 0000000000..4c46359c2d
--- /dev/null
+++ b/tests/dragon_wlm/utils/channel.py
@@ -0,0 +1,125 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import base64
+import pathlib
+import threading
+import typing as t
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class FileSystemCommChannel(CommChannelBase):
+ """Passes messages by writing to a file"""
+
+ def __init__(self, key: pathlib.Path) -> None:
+ """Initialize the FileSystemCommChannel instance.
+
+ :param key: a path to the root directory of the feature store
+ """
+ self._lock = threading.RLock()
+
+ super().__init__(key.as_posix())
+ self._file_path = key
+
+ if not self._file_path.parent.exists():
+ self._file_path.parent.mkdir(parents=True)
+
+ self._file_path.touch()
+
+ def send(self, value: bytes, timeout: float = 0) -> None:
+        """Send a message through the underlying communication channel.
+
+ :param value: The value to send
+ :param timeout: maximum time to wait (in seconds) for messages to send
+ """
+ with self._lock:
+ # write as text so we can add newlines as delimiters
+ with open(self._file_path, "a") as fp:
+ encoded_value = base64.b64encode(value).decode("utf-8")
+ fp.write(f"{encoded_value}\n")
+ logger.debug(f"FileSystemCommChannel {self._file_path} sent message")
+
+ def recv(self, timeout: float = 0) -> t.List[bytes]:
+ """Receives message(s) through the underlying communication channel.
+
+ :param timeout: maximum time to wait (in seconds) for messages to arrive
+ :returns: the received message
+ :raises SmartSimError: if the descriptor points to a missing file
+ """
+ with self._lock:
+ messages: t.List[bytes] = []
+ if not self._file_path.exists():
+ raise SmartSimError("Empty channel")
+
+ # read as text so we can split on newlines
+ with open(self._file_path, "r") as fp:
+ lines = fp.readlines()
+
+ if lines:
+ line = lines.pop(0)
+ event_bytes = base64.b64decode(line.encode("utf-8"))
+ messages.append(event_bytes)
+
+ self.clear()
+
+ # remove the first message only, write remainder back...
+ if len(lines) > 0:
+ with open(self._file_path, "w") as fp:
+ fp.writelines(lines)
+
+ logger.debug(
+ f"FileSystemCommChannel {self._file_path} received message"
+ )
+
+ return messages
+
+ def clear(self) -> None:
+ """Create an empty file for events."""
+ if self._file_path.exists():
+ self._file_path.unlink()
+ self._file_path.touch()
+
+ @classmethod
+ def from_descriptor(
+ cls,
+ descriptor: str,
+ ) -> "FileSystemCommChannel":
+ """A factory method that creates an instance from a descriptor string.
+
+ :param descriptor: The descriptor that uniquely identifies the resource
+ :returns: An attached FileSystemCommChannel
+ """
+ try:
+ path = pathlib.Path(descriptor)
+ return FileSystemCommChannel(path)
+        except Exception:
+            logger.warning(f"failed to create fs comm channel: {descriptor}")
+            raise
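+
+
+# Hedged usage sketch (not exercised by the test suite): a round trip through
+# the file-backed channel defined above. Messages are stored as base64-encoded
+# lines, so a send followed by a recv returns the original bytes. The temporary
+# directory used here is an assumption made purely for illustration.
+if __name__ == "__main__":
+    import tempfile
+
+    demo_path = pathlib.Path(tempfile.mkdtemp()) / "demo_channel.txt"
+    demo_channel = FileSystemCommChannel(demo_path)
+    demo_channel.send(b"hello")
+    assert demo_channel.recv() == [b"hello"]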
diff --git a/tests/dragon_wlm/utils/msg_pump.py b/tests/dragon_wlm/utils/msg_pump.py
new file mode 100644
index 0000000000..8d69e57c63
--- /dev/null
+++ b/tests/dragon_wlm/utils/msg_pump.py
@@ -0,0 +1,225 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import io
+import logging
+import pathlib
+import sys
+import time
+import typing as t
+
+import pytest
+
+pytest.importorskip("torch")
+pytest.importorskip("dragon")
+
+
+# isort: off
+import dragon
+import multiprocessing as mp
+import torch
+import torch.nn as nn
+
+# isort: on
+
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+ BackboneFeatureStore,
+)
+from smartsim._core.mli.message_handler import MessageHandler
+from smartsim.log import get_logger
+
+logger = get_logger(__name__, log_level=logging.DEBUG)
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+try:
+ mp.set_start_method("dragon")
+except Exception:
+ pass
+
+
+class MiniModel(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ self._name = "mini-model"
+ self._net = torch.nn.Linear(2, 1)
+
+ def forward(self, input):
+ return self._net(input)
+
+ @property
+ def bytes(self) -> bytes:
+ """Returns the model serialized to a byte stream"""
+ buffer = io.BytesIO()
+ scripted = torch.jit.trace(self._net, self.get_batch())
+ torch.jit.save(scripted, buffer)
+ return buffer.getvalue()
+
+ @classmethod
+ def get_batch(cls) -> "torch.Tensor":
+ return torch.randn((100, 2), dtype=torch.float32)
+
+
+def load_model() -> bytes:
+ """Create a simple torch model in memory for testing"""
+ mini_model = MiniModel()
+ return mini_model.bytes
+
+
+def persist_model_file(model_path: pathlib.Path) -> pathlib.Path:
+    """Create a simple torch model and persist it to disk for
+    testing purposes.
+
+    :param model_path: The location on disk to write the model file
+    :returns: Path to the model file
+    """
+ # test_path = pathlib.Path(work_dir)
+ if not model_path.parent.exists():
+ model_path.parent.mkdir(parents=True, exist_ok=True)
+
+ model_path.unlink(missing_ok=True)
+
+ model = torch.nn.Linear(2, 1)
+ torch.save(model, model_path)
+
+ return model_path
+
+
+def _mock_messages(
+ dispatch_fli_descriptor: str,
+ fs_descriptor: str,
+ parent_iteration: int,
+ callback_descriptor: str,
+) -> None:
+ """Mock event producer for triggering the inference pipeline."""
+ model_key = "mini-model"
+    # mock_messages sends 2 messages, so we offset by 2 * (# of iterations in caller)
+ offset = 2 * parent_iteration
+
+ feature_store = BackboneFeatureStore.from_descriptor(fs_descriptor)
+ request_dispatcher_queue = DragonFLIChannel.from_descriptor(dispatch_fli_descriptor)
+
+ feature_store[model_key] = load_model()
+
+ for iteration_number in range(2):
+ logged_iteration = offset + iteration_number
+ logger.debug(f"Sending mock message {logged_iteration}")
+
+ output_key = f"output-{iteration_number}"
+
+ tensor = (
+ (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32)
+ ).numpy()
+ fsd = feature_store.descriptor
+
+ tensor_desc = MessageHandler.build_tensor_descriptor(
+ "c", "float32", list(tensor.shape)
+ )
+
+ message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd)
+ message_model_key = MessageHandler.build_model_key(model_key, fsd)
+
+ request = MessageHandler.build_request(
+ reply_channel=callback_descriptor,
+ model=message_model_key,
+ inputs=[tensor_desc],
+ outputs=[message_tensor_output_key],
+ output_descriptors=[],
+ custom_attributes=None,
+ )
+
+ logger.info(f"Sending request {iteration_number} to request_dispatcher_queue")
+ request_bytes = MessageHandler.serialize_request(request)
+
+ logger.info("Sending msg_envelope")
+
+ # cuid = request_dispatcher_queue._channel.cuid
+ # logger.info(f"\tInternal cuid: {cuid}")
+
+ # send the header & body together so they arrive together
+ try:
+ request_dispatcher_queue.send_multiple([request_bytes, tensor.tobytes()])
+ logger.info(f"\tenvelope 0: {request_bytes[:5]}...")
+ logger.info(f"\tenvelope 1: {tensor.tobytes()[:5]}...")
+        except Exception:
+ logger.exception("Unable to send request envelope")
+
+ logger.info("All messages sent")
+
+ # keep the process alive for an extra 15 seconds to let the processor
+ # have access to the channels before they're destroyed
+ for _ in range(15):
+ time.sleep(1)
+
+
+def mock_messages(
+ dispatch_fli_descriptor: str,
+ fs_descriptor: str,
+ parent_iteration: int,
+ callback_descriptor: str,
+) -> int:
+    """Mock event producer for triggering the inference pipeline. Used
+    when the pump is started via multiprocessing."""
+ logger.info(f"{dispatch_fli_descriptor=}")
+ logger.info(f"{fs_descriptor=}")
+ logger.info(f"{parent_iteration=}")
+ logger.info(f"{callback_descriptor=}")
+
+    try:
+        _mock_messages(
+            dispatch_fli_descriptor,
+            fs_descriptor,
+            parent_iteration,
+            callback_descriptor,
+        )
+    except Exception:
+        logger.exception("The mock message pump failed")
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+ import argparse
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dispatch-fli-descriptor", type=str)
+    parser.add_argument("--fs-descriptor", type=str)
+    parser.add_argument("--parent-iteration", type=int)
+    parser.add_argument("--callback-descriptor", type=str)
+
+    args = parser.parse_args()
+
+ return_code = mock_messages(
+ args.dispatch_fli_descriptor,
+ args.fs_descriptor,
+ args.parent_iteration,
+ args.callback_descriptor,
+ )
+ sys.exit(return_code)
diff --git a/tests/dragon_wlm/utils/worker.py b/tests/dragon_wlm/utils/worker.py
new file mode 100644
index 0000000000..0582cae566
--- /dev/null
+++ b/tests/dragon_wlm/utils/worker.py
@@ -0,0 +1,104 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import io
+import typing as t
+
+import torch
+
+import smartsim._core.mli.infrastructure.worker.worker as mliw
+import smartsim.error as sse
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class IntegratedTorchWorker(mliw.MachineLearningWorkerBase):
+ """A minimum implementation of a worker that executes a PyTorch model"""
+
+ # @staticmethod
+ # def deserialize(request: InferenceRequest) -> t.List[t.Any]:
+ # # request.input_meta
+ # # request.raw_inputs
+ # return request
+
+ @staticmethod
+ def load_model(
+ request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str
+ ) -> mliw.LoadModelResult:
+ model_bytes = fetch_result.model_bytes or request.raw_model
+ if not model_bytes:
+ raise ValueError("Unable to load model without reference object")
+
+ model: torch.nn.Module = torch.load(io.BytesIO(model_bytes))
+ result = mliw.LoadModelResult(model)
+ return result
+
+ @staticmethod
+ def transform_input(
+ request: mliw.InferenceRequest,
+ fetch_result: mliw.FetchInputResult,
+ device: str,
+ ) -> mliw.TransformInputResult:
+ # extra metadata for assembly can be found in request.input_meta
+ raw_inputs = request.raw_inputs or fetch_result.inputs
+
+ result: t.List[torch.Tensor] = []
+ # should this happen here?
+ # consider - fortran to c data layout
+ # is there an intermediate representation before really doing torch.load?
+ if raw_inputs:
+ result = [torch.load(io.BytesIO(item)) for item in raw_inputs]
+
+ return mliw.TransformInputResult(result)
+
+ @staticmethod
+ def execute(
+ request: mliw.InferenceRequest,
+ load_result: mliw.LoadModelResult,
+ transform_result: mliw.TransformInputResult,
+ ) -> mliw.ExecuteResult:
+ if not load_result.model:
+ raise sse.SmartSimError("Model must be loaded to execute")
+
+ model = load_result.model
+ results = [model(tensor) for tensor in transform_result.transformed]
+
+ execute_result = mliw.ExecuteResult(results)
+ return execute_result
+
+ @staticmethod
+ def transform_output(
+ request: mliw.InferenceRequest,
+ execute_result: mliw.ExecuteResult,
+ result_device: str,
+ ) -> mliw.TransformOutputResult:
+ # send the original tensors...
+        execute_result.predictions = [
+            prediction.detach() for prediction in execute_result.predictions
+        ]
+        # todo: solve sending all tensor metadata that coincides with each prediction
+ return mliw.TransformOutputResult(
+ execute_result.predictions, [1], "c", "float32"
+ )
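+
+
+# Hedged sketch (illustrative only): the byte-level convention `transform_input`
+# above assumes for raw inputs, i.e. each tensor arrives as the bytes produced
+# by `torch.save` and is rehydrated with `torch.load` before execution.
+if __name__ == "__main__":
+    demo_buffer = io.BytesIO()
+    torch.save(torch.ones((1, 2)), demo_buffer)
+    rehydrated = torch.load(io.BytesIO(demo_buffer.getvalue()))
+    assert tuple(rehydrated.shape) == (1, 2)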
diff --git a/tests/mli/__init__.py b/tests/mli/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/mli/channel.py b/tests/mli/channel.py
new file mode 100644
index 0000000000..4c46359c2d
--- /dev/null
+++ b/tests/mli/channel.py
@@ -0,0 +1,125 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import base64
+import pathlib
+import threading
+import typing as t
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class FileSystemCommChannel(CommChannelBase):
+ """Passes messages by writing to a file"""
+
+ def __init__(self, key: pathlib.Path) -> None:
+ """Initialize the FileSystemCommChannel instance.
+
+ :param key: a path to the root directory of the feature store
+ """
+ self._lock = threading.RLock()
+
+ super().__init__(key.as_posix())
+ self._file_path = key
+
+ if not self._file_path.parent.exists():
+ self._file_path.parent.mkdir(parents=True)
+
+ self._file_path.touch()
+
+ def send(self, value: bytes, timeout: float = 0) -> None:
+        """Send a message through the underlying communication channel.
+
+ :param value: The value to send
+ :param timeout: maximum time to wait (in seconds) for messages to send
+ """
+ with self._lock:
+ # write as text so we can add newlines as delimiters
+ with open(self._file_path, "a") as fp:
+ encoded_value = base64.b64encode(value).decode("utf-8")
+ fp.write(f"{encoded_value}\n")
+ logger.debug(f"FileSystemCommChannel {self._file_path} sent message")
+
+ def recv(self, timeout: float = 0) -> t.List[bytes]:
+ """Receives message(s) through the underlying communication channel.
+
+ :param timeout: maximum time to wait (in seconds) for messages to arrive
+ :returns: the received message
+ :raises SmartSimError: if the descriptor points to a missing file
+ """
+ with self._lock:
+ messages: t.List[bytes] = []
+ if not self._file_path.exists():
+ raise SmartSimError("Empty channel")
+
+ # read as text so we can split on newlines
+ with open(self._file_path, "r") as fp:
+ lines = fp.readlines()
+
+ if lines:
+ line = lines.pop(0)
+ event_bytes = base64.b64decode(line.encode("utf-8"))
+ messages.append(event_bytes)
+
+ self.clear()
+
+ # remove the first message only, write remainder back...
+ if len(lines) > 0:
+ with open(self._file_path, "w") as fp:
+ fp.writelines(lines)
+
+ logger.debug(
+ f"FileSystemCommChannel {self._file_path} received message"
+ )
+
+ return messages
+
+ def clear(self) -> None:
+ """Create an empty file for events."""
+ if self._file_path.exists():
+ self._file_path.unlink()
+ self._file_path.touch()
+
+ @classmethod
+ def from_descriptor(
+ cls,
+ descriptor: str,
+ ) -> "FileSystemCommChannel":
+ """A factory method that creates an instance from a descriptor string.
+
+ :param descriptor: The descriptor that uniquely identifies the resource
+ :returns: An attached FileSystemCommChannel
+ """
+ try:
+ path = pathlib.Path(descriptor)
+ return FileSystemCommChannel(path)
+        except Exception:
+            logger.warning(f"failed to create fs comm channel: {descriptor}")
+            raise
diff --git a/tests/mli/feature_store.py b/tests/mli/feature_store.py
new file mode 100644
index 0000000000..7bc18253c8
--- /dev/null
+++ b/tests/mli/feature_store.py
@@ -0,0 +1,144 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pathlib
+import typing as t
+
+import smartsim.error as sse
+from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class MemoryFeatureStore(FeatureStore):
+ """A feature store with values persisted only in local memory"""
+
+ def __init__(self, storage: t.Optional[t.Dict[str, bytes]] = None) -> None:
+ """Initialize the MemoryFeatureStore instance"""
+ super().__init__("in-memory-fs")
+ if storage is None:
+            storage = {"_": b"abc"}
+ self._storage: t.Dict[str, bytes] = storage
+
+ def _get(self, key: str) -> bytes:
+ """Retrieve an item using key
+
+ :param key: Unique key of an item to retrieve from the feature store"""
+ if key not in self._storage:
+ raise sse.SmartSimError(f"{key} not found in feature store")
+ return self._storage[key]
+
+    def _set(self, key: str, value: bytes) -> None:
+        """Assign a value to a key in the feature store.
+
+        :param key: Unique key of an item to set in the feature store
+        :param value: Value to persist in the feature store"""
+        self._check_reserved(key)
+        self._storage[key] = value
+
+    def _contains(self, key: str) -> bool:
+        """Membership operator to test for a key existing within the feature store.
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
+        return key in self._storage
+
+
+class FileSystemFeatureStore(FeatureStore):
+ """Alternative feature store implementation for testing. Stores all
+ data on the file system"""
+
+    def __init__(self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None) -> None:
+ """Initialize the FileSystemFeatureStore instance
+
+ :param storage_dir: (optional) root directory to store all data relative to"""
+ if isinstance(storage_dir, str):
+ storage_dir = pathlib.Path(storage_dir)
+ self._storage_dir = storage_dir
+ super().__init__(storage_dir.as_posix())
+
+ def _get(self, key: str) -> bytes:
+ """Retrieve an item using key
+
+ :param key: Unique key of an item to retrieve from the feature store"""
+ path = self._key_path(key)
+ if not path.exists():
+ raise sse.SmartSimError(f"{path} not found in feature store")
+ return path.read_bytes()
+
+ def _set(self, key: str, value: bytes) -> None:
+ """Assign a value using key
+
+ :param key: Unique key of an item to set in the feature store
+ :param value: Value to persist in the feature store"""
+ path = self._key_path(key, create=True)
+ if isinstance(value, str):
+ value = value.encode("utf-8")
+ path.write_bytes(value)
+
+ def _contains(self, key: str) -> bool:
+ """Membership operator to test for a key existing within the feature store.
+
+ :param key: Unique key of an item to retrieve from the feature store
+ :returns: `True` if the key is found, `False` otherwise"""
+ path = self._key_path(key)
+ return path.exists()
+
+    def _key_path(self, key: str, create: bool = False) -> pathlib.Path:
+        """Given a key, return a path that is optionally combined with a base
+        directory used by the FileSystemFeatureStore.
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :param create: If `True`, create any missing parent directories
+        :returns: The file path associated with the key"""
+ value = pathlib.Path(key)
+
+ if self._storage_dir:
+ value = self._storage_dir / key
+
+ if create:
+ value.parent.mkdir(parents=True, exist_ok=True)
+
+ return value
+
+ @classmethod
+ def from_descriptor(
+ cls,
+ descriptor: str,
+ ) -> "FileSystemFeatureStore":
+ """A factory method that creates an instance from a descriptor string
+
+ :param descriptor: The descriptor that uniquely identifies the resource
+ :returns: An attached FileSystemFeatureStore"""
+        try:
+            path = pathlib.Path(descriptor)
+            path.mkdir(parents=True, exist_ok=True)
+            if not path.is_dir():
+                raise ValueError("FileSystemFeatureStore requires a directory path")
+            return FileSystemFeatureStore(path)
+        except Exception:
+            logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}")
+            raise
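+
+
+# Hedged usage sketch (not exercised by the test suite): a set/get round trip
+# through the file-backed store defined above, using its protected helpers
+# directly. The temporary directory is an assumption made for illustration and
+# the key is assumed not to be reserved by the base FeatureStore.
+if __name__ == "__main__":
+    import tempfile
+
+    demo_store = FileSystemFeatureStore(tempfile.mkdtemp())
+    demo_store._set("demo-key", b"demo-value")
+    assert demo_store._get("demo-key") == b"demo-value"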
diff --git a/tests/mli/test_integrated_torch_worker.py b/tests/mli/test_integrated_torch_worker.py
new file mode 100644
index 0000000000..4d93358bfb
--- /dev/null
+++ b/tests/mli/test_integrated_torch_worker.py
@@ -0,0 +1,271 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pathlib
+
+import pytest
+import torch
+
+# The tests in this file belong to the group_b group
+pytestmark = pytest.mark.group_b
+
+# retrieved from pytest fixtures
+is_dragon = pytest.test_launcher == "dragon"
+torch_available = (
+ "torch" in []
+) # todo: update test to replace installed_redisai_backends()
+
+
+@pytest.fixture
+def persist_torch_model(test_dir: str) -> pathlib.Path:
+ test_path = pathlib.Path(test_dir)
+ model_path = test_path / "basic.pt"
+
+ model = torch.nn.Linear(2, 1)
+ torch.save(model, model_path)
+
+ return model_path
+
+
+# todo: move deserialization tests into suite for worker manager where serialization occurs
+
+
+# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+# def test_deserialize_direct_request(persist_torch_model: pathlib.Path) -> None:
+#     """Verify that a direct request is deserialized properly"""
+# worker = mli.IntegratedTorchWorker
+# # feature_store = mli.MemoryFeatureStore()
+
+# model_bytes = persist_torch_model.read_bytes()
+# input_tensor = torch.randn(2)
+
+# expected_callback_channel = b"faux_channel_descriptor_bytes"
+# callback_channel = mli.DragonCommChannel.find(expected_callback_channel)
+
+# message_tensor_input = MessageHandler.build_tensor(
+# input_tensor, "c", "float32", [2]
+# )
+
+# request = MessageHandler.build_request(
+# reply_channel=callback_channel.descriptor,
+# model=model_bytes,
+# inputs=[message_tensor_input],
+# outputs=[],
+# custom_attributes=None,
+# )
+
+# msg_bytes = MessageHandler.serialize_request(request)
+
+# inference_request = worker.deserialize(msg_bytes)
+# assert inference_request.callback._descriptor == expected_callback_channel
+
+
+# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+# def test_deserialize_indirect_request(persist_torch_model: pathlib.Path) -> None:
+# """Verify that an indirect request is deserialized correctly"""
+# worker = mli.IntegratedTorchWorker
+# # feature_store = mli.MemoryFeatureStore()
+
+# model_key = "persisted-model"
+# # model_bytes = persist_torch_model.read_bytes()
+# # feature_store[model_key] = model_bytes
+
+# input_key = f"demo-input"
+# # input_tensor = torch.randn(2)
+# # feature_store[input_key] = input_tensor
+
+# expected_callback_channel = b"faux_channel_descriptor_bytes"
+# callback_channel = mli.DragonCommChannel.find(expected_callback_channel)
+
+# output_key = f"demo-output"
+
+# message_tensor_output_key = MessageHandler.build_tensor_key(output_key)
+# message_tensor_input_key = MessageHandler.build_tensor_key(input_key)
+# message_model_key = MessageHandler.build_model_key(model_key)
+
+# request = MessageHandler.build_request(
+# reply_channel=callback_channel.descriptor,
+# model=message_model_key,
+# inputs=[message_tensor_input_key],
+# outputs=[message_tensor_output_key],
+# custom_attributes=None,
+# )
+
+# msg_bytes = MessageHandler.serialize_request(request)
+
+# inference_request = worker.deserialize(msg_bytes)
+# assert inference_request.callback._descriptor == expected_callback_channel
+
+
+# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+# def test_deserialize_mixed_mode_indirect_inputs(
+# persist_torch_model: pathlib.Path,
+# ) -> None:
+# """Verify that a mixed mode (combining direct and indirect inputs, models, outputs)
+# with indirect inputs is deserialized correctly"""
+# worker = mli.IntegratedTorchWorker
+# # feature_store = mli.MemoryFeatureStore()
+
+# # model_key = "persisted-model"
+# model_bytes = persist_torch_model.read_bytes()
+# # feature_store[model_key] = model_bytes
+
+# input_key = f"demo-input"
+# # input_tensor = torch.randn(2)
+# # feature_store[input_key] = input_tensor
+
+# expected_callback_channel = b"faux_channel_descriptor_bytes"
+# callback_channel = mli.DragonCommChannel.find(expected_callback_channel)
+
+# output_key = f"demo-output"
+
+# message_tensor_output_key = MessageHandler.build_tensor_key(output_key)
+# message_tensor_input_key = MessageHandler.build_tensor_key(input_key)
+# # message_model_key = MessageHandler.build_model_key(model_key)
+
+# request = MessageHandler.build_request(
+# reply_channel=callback_channel.descriptor,
+# model=model_bytes,
+# inputs=[message_tensor_input_key],
+# # outputs=[message_tensor_output_key],
+# outputs=[],
+# custom_attributes=None,
+# )
+
+# msg_bytes = MessageHandler.serialize_request(request)
+
+# inference_request = worker.deserialize(msg_bytes)
+# assert inference_request.callback._descriptor == expected_callback_channel
+
+
+# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+# def test_deserialize_mixed_mode_indirect_outputs(
+# persist_torch_model: pathlib.Path,
+# ) -> None:
+# """Verify that a mixed mode (combining direct and indirect inputs, models, outputs)
+# with indirect outputs is deserialized correctly"""
+# worker = mli.IntegratedTorchWorker
+# # feature_store = mli.MemoryFeatureStore()
+
+# # model_key = "persisted-model"
+# model_bytes = persist_torch_model.read_bytes()
+# # feature_store[model_key] = model_bytes
+
+# input_key = f"demo-input"
+# input_tensor = torch.randn(2)
+# # feature_store[input_key] = input_tensor
+
+# expected_callback_channel = b"faux_channel_descriptor_bytes"
+# callback_channel = mli.DragonCommChannel.find(expected_callback_channel)
+
+# output_key = f"demo-output"
+
+# message_tensor_output_key = MessageHandler.build_tensor_key(output_key)
+# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key)
+# # message_model_key = MessageHandler.build_model_key(model_key)
+# message_tensor_input = MessageHandler.build_tensor(
+# input_tensor, "c", "float32", [2]
+# )
+
+# request = MessageHandler.build_request(
+# reply_channel=callback_channel.descriptor,
+# model=model_bytes,
+# inputs=[message_tensor_input],
+# # outputs=[message_tensor_output_key],
+# outputs=[message_tensor_output_key],
+# custom_attributes=None,
+# )
+
+# msg_bytes = MessageHandler.serialize_request(request)
+
+# inference_request = worker.deserialize(msg_bytes)
+# assert inference_request.callback._descriptor == expected_callback_channel
+
+
+# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+# def test_deserialize_mixed_mode_indirect_model(
+# persist_torch_model: pathlib.Path,
+# ) -> None:
+# """Verify that a mixed mode (combining direct and indirect inputs, models, outputs)
+# with indirect outputs is deserialized correctly"""
+# worker = mli.IntegratedTorchWorker
+# # feature_store = mli.MemoryFeatureStore()
+
+# model_key = "persisted-model"
+# # model_bytes = persist_torch_model.read_bytes()
+# # feature_store[model_key] = model_bytes
+
+# # input_key = f"demo-input"
+# input_tensor = torch.randn(2)
+# # feature_store[input_key] = input_tensor
+
+# expected_callback_channel = b"faux_channel_descriptor_bytes"
+# callback_channel = mli.DragonCommChannel.find(expected_callback_channel)
+
+# output_key = f"demo-output"
+
+# # message_tensor_output_key = MessageHandler.build_tensor_key(output_key)
+# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key)
+# message_model_key = MessageHandler.build_model_key(model_key)
+# message_tensor_input = MessageHandler.build_tensor(
+# input_tensor, "c", "float32", [2]
+# )
+
+# request = MessageHandler.build_request(
+# reply_channel=callback_channel.descriptor,
+# model=message_model_key,
+# inputs=[message_tensor_input],
+# # outputs=[message_tensor_output_key],
+# outputs=[],
+# custom_attributes=None,
+# )
+
+# msg_bytes = MessageHandler.serialize_request(request)
+
+# inference_request = worker.deserialize(msg_bytes)
+# assert inference_request.callback._descriptor == expected_callback_channel
+
+
+# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+# def test_serialize(test_dir: str, persist_torch_model: pathlib.Path) -> None:
+# """Verify that the worker correctly executes reply serialization"""
+# worker = mli.IntegratedTorchWorker
+
+# reply = mli.InferenceReply()
+# reply.output_keys = ["foo", "bar"]
+
+# # use the worker implementation of reply serialization to get bytes for
+# # use on the callback channel
+# reply_bytes = worker.serialize_reply(reply)
+# assert reply_bytes is not None
+
+# # deserialize to verity the mapping in the worker.serialize_reply was correct
+# actual_reply = MessageHandler.deserialize_response(reply_bytes)
+
+# actual_tensor_keys = [tk.key for tk in actual_reply.result.keys]
+# assert set(actual_tensor_keys) == set(reply.output_keys)
+# assert actual_reply.status == 200
+# assert actual_reply.statusMessage == "success"
diff --git a/tests/mli/test_service.py b/tests/mli/test_service.py
new file mode 100644
index 0000000000..41595ca80b
--- /dev/null
+++ b/tests/mli/test_service.py
@@ -0,0 +1,290 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import datetime
+import time
+import typing as t
+
+import pytest
+
+from smartsim._core.entrypoints.service import Service
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+
+class SimpleService(Service):
+ """Mock implementation of a service that counts method invocations
+ using the base class event hooks."""
+
+ def __init__(
+ self,
+ log: t.List[str],
+ quit_after: int = -1,
+ as_service: bool = False,
+ cooldown: float = 0,
+ loop_delay: float = 0,
+ hc_freq: float = -1,
+ run_for: float = 0,
+ ) -> None:
+ super().__init__(as_service, cooldown, loop_delay, hc_freq)
+ self._log = log
+ self._quit_after = quit_after
+ self.num_starts = 0
+ self.num_shutdowns = 0
+ self.num_health_checks = 0
+ self.num_cooldowns = 0
+ self.num_delays = 0
+ self.num_iterations = 0
+ self.num_can_shutdown = 0
+ self.run_for = run_for
+ self.start_time = time.time()
+
+ @property
+ def runtime(self) -> float:
+ return time.time() - self.start_time
+
+    def _can_shutdown(self) -> bool:
+        self.num_can_shutdown += 1
+
+        if self._quit_after > -1 and self.num_iterations >= self._quit_after:
+            return True
+        if self.run_for > 0:
+            return self.runtime >= self.run_for
+
+        return False
+
+ def _on_start(self) -> None:
+ self.num_starts += 1
+
+ def _on_shutdown(self) -> None:
+ self.num_shutdowns += 1
+
+ def _on_health_check(self) -> None:
+ self.num_health_checks += 1
+
+ def _on_cooldown_elapsed(self) -> None:
+ self.num_cooldowns += 1
+
+ def _on_delay(self) -> None:
+ self.num_delays += 1
+
+    def _on_iteration(self) -> None:
+        self.num_iterations += 1
+
+
+def test_service_init() -> None:
+ """Verify expected default values after Service initialization"""
+ activity_log: t.List[str] = []
+ service = SimpleService(activity_log)
+
+ assert service._as_service is False
+ assert service._cooldown == 0
+ assert service._loop_delay == 0
+
+
+def test_service_run_once() -> None:
+ """Verify the service completes after a single call to _on_iteration"""
+ activity_log: t.List[str] = []
+ service = SimpleService(activity_log)
+
+ service.execute()
+
+ assert service.num_iterations == 1
+ assert service.num_starts == 1
+ assert service.num_cooldowns == 0 # it never exceeds a cooldown period
+ assert service.num_can_shutdown == 0 # it automatically exits in run once
+ assert service.num_shutdowns == 1
+
+
+@pytest.mark.parametrize(
+ "num_iterations",
+ [
+ pytest.param(0, id="Immediate Shutdown"),
+ pytest.param(1, id="1x"),
+ pytest.param(2, id="2x"),
+ pytest.param(4, id="4x"),
+ pytest.param(8, id="8x"),
+ pytest.param(16, id="16x"),
+ pytest.param(32, id="32x"),
+ ],
+)
+def test_service_run_until_can_shutdown(num_iterations: int) -> None:
+ """Verify the service completes after a dynamic number of iterations
+ based on the return value of `_can_shutdown`"""
+ activity_log: t.List[str] = []
+
+ service = SimpleService(activity_log, quit_after=num_iterations, as_service=True)
+
+ service.execute()
+
+ if num_iterations == 0:
+ # no matter what, it should always execute the _on_iteration method
+ assert service.num_iterations == 1
+ else:
+ # the shutdown check follows on_iteration. there will be one last call
+ assert service.num_iterations == num_iterations
+
+ assert service.num_starts == 1
+ assert service.num_shutdowns == 1
+
+
+@pytest.mark.parametrize(
+ "cooldown",
+ [
+ pytest.param(1, id="1s"),
+ pytest.param(3, id="3s"),
+ pytest.param(5, id="5s"),
+ ],
+)
+def test_service_cooldown(cooldown: int) -> None:
+ """Verify that the cooldown period is respected"""
+ activity_log: t.List[str] = []
+
+ service = SimpleService(
+ activity_log,
+ quit_after=1,
+ as_service=True,
+ cooldown=cooldown,
+ loop_delay=0,
+ )
+
+ ts0 = datetime.datetime.now()
+ service.execute()
+ ts1 = datetime.datetime.now()
+
+ fudge_factor = 1.1 # allow a little bit of wiggle room for the loop
+ duration_in_seconds = (ts1 - ts0).total_seconds()
+
+ assert duration_in_seconds <= cooldown * fudge_factor
+ assert service.num_cooldowns == 1
+ assert service.num_shutdowns == 1
+
+
+@pytest.mark.parametrize(
+ "delay, num_iterations",
+ [
+ pytest.param(1, 3, id="1s delay, 3x"),
+        pytest.param(3, 2, id="3s delay, 2x"),
+ pytest.param(5, 1, id="5s delay, 1x"),
+ ],
+)
+def test_service_delay(delay: int, num_iterations: int) -> None:
+ """Verify that a delay is correctly added between iterations"""
+ activity_log: t.List[str] = []
+
+ service = SimpleService(
+ activity_log,
+ quit_after=num_iterations,
+ as_service=True,
+ cooldown=0,
+ loop_delay=delay,
+ )
+
+ ts0 = datetime.datetime.now()
+ service.execute()
+ ts1 = datetime.datetime.now()
+
+ # the expected duration is the sum of the delay between each iteration
+ expected_duration = (num_iterations + 1) * delay
+ duration_in_seconds = (ts1 - ts0).total_seconds()
+
+ assert duration_in_seconds <= expected_duration
+ assert service.num_cooldowns == 0
+ assert service.num_shutdowns == 1
+
+
+@pytest.mark.parametrize(
+ "health_check_freq, run_for",
+ [
+        pytest.param(1, 5.5, id="1s freq, 5x"),
+ pytest.param(5, 10.5, id="5s freq, 2x"),
+ pytest.param(0.1, 5.1, id="0.1s freq, 50x"),
+ ],
+)
+def test_service_health_check_freq(health_check_freq: float, run_for: float) -> None:
+    """Verify that the health check frequency is honored.
+
+    :param health_check_freq: The desired frequency of the health check
+    :param run_for: A fixed duration to allow the service to run
+    """
+ activity_log: t.List[str] = []
+
+ service = SimpleService(
+ activity_log,
+ quit_after=-1,
+ as_service=True,
+ cooldown=0,
+ hc_freq=health_check_freq,
+ run_for=run_for,
+ )
+
+ ts0 = datetime.datetime.now()
+ service.execute()
+ ts1 = datetime.datetime.now()
+
+    # the expected number of health checks is the fixed runtime divided by the frequency
+    expected_hc_count = run_for // health_check_freq
+
+ # allow some wiggle room for frequency comparison
+ assert expected_hc_count - 2 <= service.num_health_checks <= expected_hc_count + 2
+
+ assert service.num_cooldowns == 0
+ assert service.num_shutdowns == 1
+
+
+def test_service_health_check_freq_unbound() -> None:
+    """Verify that a health check frequency of zero is treated as
+    "always on" and the health check hook is invoked on every loop iteration."""
+ health_check_freq: float = 0.0
+ run_for: float = 5
+
+ activity_log: t.List[str] = []
+
+ service = SimpleService(
+ activity_log,
+ quit_after=-1,
+ as_service=True,
+ cooldown=0,
+ hc_freq=health_check_freq,
+ run_for=run_for,
+ )
+
+ service.execute()
+
+ # allow some wiggle room for frequency comparison
+ assert service.num_health_checks == service.num_iterations
+ assert service.num_cooldowns == 0
+ assert service.num_shutdowns == 1
diff --git a/tests/mli/worker.py b/tests/mli/worker.py
new file mode 100644
index 0000000000..0582cae566
--- /dev/null
+++ b/tests/mli/worker.py
@@ -0,0 +1,104 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import io
+import typing as t
+
+import torch
+
+import smartsim._core.mli.infrastructure.worker.worker as mliw
+import smartsim.error as sse
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class IntegratedTorchWorker(mliw.MachineLearningWorkerBase):
+ """A minimum implementation of a worker that executes a PyTorch model"""
+
+ # @staticmethod
+ # def deserialize(request: InferenceRequest) -> t.List[t.Any]:
+ # # request.input_meta
+ # # request.raw_inputs
+ # return request
+
+ @staticmethod
+ def load_model(
+ request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str
+ ) -> mliw.LoadModelResult:
+ model_bytes = fetch_result.model_bytes or request.raw_model
+ if not model_bytes:
+ raise ValueError("Unable to load model without reference object")
+
+ model: torch.nn.Module = torch.load(io.BytesIO(model_bytes))
+ result = mliw.LoadModelResult(model)
+ return result
+
+ @staticmethod
+ def transform_input(
+ request: mliw.InferenceRequest,
+ fetch_result: mliw.FetchInputResult,
+ device: str,
+ ) -> mliw.TransformInputResult:
+ # extra metadata for assembly can be found in request.input_meta
+ raw_inputs = request.raw_inputs or fetch_result.inputs
+
+ result: t.List[torch.Tensor] = []
+ # should this happen here?
+ # consider - fortran to c data layout
+ # is there an intermediate representation before really doing torch.load?
+ if raw_inputs:
+ result = [torch.load(io.BytesIO(item)) for item in raw_inputs]
+
+ return mliw.TransformInputResult(result)
+
+ @staticmethod
+ def execute(
+ request: mliw.InferenceRequest,
+ load_result: mliw.LoadModelResult,
+ transform_result: mliw.TransformInputResult,
+ ) -> mliw.ExecuteResult:
+ if not load_result.model:
+ raise sse.SmartSimError("Model must be loaded to execute")
+
+ model = load_result.model
+ results = [model(tensor) for tensor in transform_result.transformed]
+
+ execute_result = mliw.ExecuteResult(results)
+ return execute_result
+
+ @staticmethod
+ def transform_output(
+ request: mliw.InferenceRequest,
+ execute_result: mliw.ExecuteResult,
+ result_device: str,
+ ) -> mliw.TransformOutputResult:
+ # send the original tensors...
+        execute_result.predictions = [
+            prediction.detach() for prediction in execute_result.predictions
+        ]
+        # todo: solve sending all tensor metadata that coincides with each prediction
+ return mliw.TransformOutputResult(
+ execute_result.predictions, [1], "c", "float32"
+ )
diff --git a/tests/test_dragon_comm_utils.py b/tests/test_dragon_comm_utils.py
new file mode 100644
index 0000000000..a6f9c206a4
--- /dev/null
+++ b/tests/test_dragon_comm_utils.py
@@ -0,0 +1,257 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import base64
+import pathlib
+import uuid
+
+import pytest
+
+from smartsim.error.errors import SmartSimError
+
+dragon = pytest.importorskip("dragon")
+
+# isort: off
+import dragon.channels as dch
+import dragon.infrastructure.parameters as dp
+import dragon.managed_memory as dm
+import dragon.fli as fli
+
+# isort: on
+
+from smartsim._core.mli.comm.channel import dragon_util
+from smartsim.log import get_logger
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+logger = get_logger(__name__)
+
+
+@pytest.fixture(scope="function")
+def the_pool() -> dm.MemoryPool:
+    """Attach to the default memory pool of the current process."""
+ raw_pool_descriptor = dp.this_process.default_pd
+ descriptor_ = base64.b64decode(raw_pool_descriptor)
+
+ pool = dm.MemoryPool.attach(descriptor_)
+ return pool
+
+
+@pytest.fixture(scope="function")
+def the_channel() -> dch.Channel:
+ """Creates a Channel attached to the local memory pool."""
+ channel = dch.Channel.make_process_local()
+ return channel
+
+
+@pytest.fixture(scope="function")
+def the_fli(the_channel) -> fli.FLInterface:
+ """Creates an FLI attached to the local memory pool."""
+ fli_ = fli.FLInterface(main_ch=the_channel, manager_ch=None)
+ return fli_
+
+
+def test_descriptor_to_channel_empty() -> None:
+ """Verify that `descriptor_to_channel` raises an exception when
+ provided with an empty descriptor."""
+ descriptor = ""
+
+ with pytest.raises(ValueError) as ex:
+ dragon_util.descriptor_to_channel(descriptor)
+
+ assert "empty" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "descriptor",
+ ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()],
+)
+def test_descriptor_to_channel_b64fail(descriptor: str) -> None:
+ """Verify that `descriptor_to_channel` raises an exception when
+ provided with an incorrectly encoded descriptor.
+
+ :param descriptor: A descriptor that is not properly base64 encoded
+ """
+
+ with pytest.raises(ValueError) as ex:
+ dragon_util.descriptor_to_channel(descriptor)
+
+ assert "base64" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "descriptor",
+ [str(uuid.uuid4())],
+)
+def test_descriptor_to_channel_channel_fail(descriptor: str) -> None:
+ """Verify that `descriptor_to_channel` raises an exception when a correctly
+ formatted descriptor that does not describe a real channel is passed.
+
+    :param descriptor: A well-formed descriptor that does not refer to an existing channel
+ """
+
+ with pytest.raises(SmartSimError) as ex:
+ dragon_util.descriptor_to_channel(descriptor)
+
+ # ensure we're receiving the right exception
+ assert "address" in ex.value.args[0]
+ assert "channel" in ex.value.args[0]
+
+
+def test_descriptor_to_channel_channel_not_available(the_channel: dch.Channel) -> None:
+ """Verify that `descriptor_to_channel` raises an exception when a channel
+ is no longer available.
+
+ :param the_channel: A dragon channel
+ """
+
+ # get a good descriptor & wipe out the channel so it can't be attached
+ descriptor = dragon_util.channel_to_descriptor(the_channel)
+ the_channel.destroy()
+
+ with pytest.raises(SmartSimError) as ex:
+ dragon_util.descriptor_to_channel(descriptor)
+
+ assert "address" in ex.value.args[0]
+
+
+def test_descriptor_to_channel_happy_path(the_channel: dch.Channel) -> None:
+ """Verify that `descriptor_to_channel` works as expected when provided
+ a valid descriptor
+
+ :param the_channel: A dragon channel
+ """
+
+ # get a good descriptor
+ descriptor = dragon_util.channel_to_descriptor(the_channel)
+
+ reattached = dragon_util.descriptor_to_channel(descriptor)
+ assert reattached
+
+ # and just make sure creation of the descriptor is transitive
+ assert dragon_util.channel_to_descriptor(reattached) == descriptor
+
+
+def test_descriptor_to_fli_empty() -> None:
+ """Verify that `descriptor_to_fli` raises an exception when
+ provided with an empty descriptor."""
+ descriptor = ""
+
+ with pytest.raises(ValueError) as ex:
+ dragon_util.descriptor_to_fli(descriptor)
+
+ assert "empty" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "descriptor",
+ ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()],
+)
+def test_descriptor_to_fli_b64fail(descriptor: str) -> None:
+ """Verify that `descriptor_to_fli` raises an exception when
+ provided with an incorrectly encoded descriptor.
+
+ :param descriptor: A descriptor that is not properly base64 encoded
+ """
+
+ with pytest.raises(ValueError) as ex:
+ dragon_util.descriptor_to_fli(descriptor)
+
+ assert "base64" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "descriptor",
+ [str(uuid.uuid4())],
+)
+def test_descriptor_to_fli_fli_fail(descriptor: str) -> None:
+ """Verify that `descriptor_to_fli` raises an exception when a correctly
+ formatted descriptor that does not describe a real FLI is passed.
+
+    :param descriptor: A well-formed descriptor that does not refer to an existing FLI
+ """
+
+ with pytest.raises(SmartSimError) as ex:
+ dragon_util.descriptor_to_fli(descriptor)
+
+ # ensure we're receiving the right exception
+ assert "address" in ex.value.args[0]
+ assert "fli" in ex.value.args[0].lower()
+
+
+def test_descriptor_to_fli_fli_not_available(
+ the_fli: fli.FLInterface, the_channel: dch.Channel
+) -> None:
+ """Verify that `descriptor_to_fli` raises an exception when a channel
+ is no longer available.
+
+ :param the_fli: A dragon FLInterface
+ :param the_channel: A dragon channel
+ """
+
+ # get a good descriptor & wipe out the FLI so it can't be attached
+ descriptor = dragon_util.channel_to_descriptor(the_fli)
+ the_fli.destroy()
+ the_channel.destroy()
+
+ with pytest.raises(SmartSimError) as ex:
+ dragon_util.descriptor_to_fli(descriptor)
+
+ # ensure we're receiving the right exception
+ assert "address" in ex.value.args[0]
+
+
+def test_descriptor_to_fli_happy_path(the_fli: fli.FLInterface) -> None:
+ """Verify that `descriptor_to_fli` works as expected when provided
+ a valid descriptor
+
+ :param the_fli: A dragon FLInterface
+ """
+
+ # get a good descriptor
+ descriptor = dragon_util.channel_to_descriptor(the_fli)
+
+ reattached = dragon_util.descriptor_to_fli(descriptor)
+ assert reattached
+
+ # and just make sure creation of the descriptor is transitive
+ assert dragon_util.channel_to_descriptor(reattached) == descriptor
+
+
+def test_pool_to_descriptor_empty() -> None:
+ """Verify that `pool_to_descriptor` raises an exception when
+ provided with a null pool."""
+
+ with pytest.raises(ValueError) as ex:
+ dragon_util.pool_to_descriptor(None)
+
+
+def test_pool_to_happy_path(the_pool) -> None:
+ """Verify that `pool_to_descriptor` creates a descriptor
+ when supplied with a valid memory pool."""
+
+ descriptor = dragon_util.pool_to_descriptor(the_pool)
+ assert descriptor
diff --git a/tests/test_generator.py b/tests/test_generator.py
index 3915526a8b..f949d8f663 100644
--- a/tests/test_generator.py
+++ b/tests/test_generator.py
@@ -85,15 +85,13 @@ def as_executable_sequence(self):
def mock_job() -> unittest.mock.MagicMock:
"""Fixture to create a mock Job."""
job = unittest.mock.MagicMock(
- **{
- "entity": EchoHelloWorldEntity(),
- "name": "test_job",
- "get_launch_steps": unittest.mock.MagicMock(
- side_effect=lambda: NotImplementedError()
- ),
- },
+ entity=EchoHelloWorldEntity(),
+ get_launch_steps=unittest.mock.MagicMock(
+ side_effect=lambda: NotImplementedError()
+ ),
spec=Job,
)
+ job.name = "test_job"
yield job
diff --git a/tests/test_message_handler/__init__.py b/tests/test_message_handler/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/test_message_handler/test_build_model.py b/tests/test_message_handler/test_build_model.py
new file mode 100644
index 0000000000..56c1c8764c
--- /dev/null
+++ b/tests/test_message_handler/test_build_model.py
@@ -0,0 +1,72 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+handler = MessageHandler()
+
+
+def test_build_model_successful():
+ expected_data = b"model data"
+ expected_name = "model name"
+ expected_version = "v0.0.1"
+ model = handler.build_model(expected_data, expected_name, expected_version)
+ assert model.data == expected_data
+ assert model.name == expected_name
+ assert model.version == expected_version
+
+
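+# Each parameter set below supplies one argument of the wrong type; the
+# builder is expected to reject it with a ValueError.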
+@pytest.mark.parametrize(
+ "data, name, version",
+ [
+ pytest.param(
+ 100,
+ "model name",
+ "v0.0.1",
+ id="bad data type",
+ ),
+ pytest.param(
+ b"model data",
+ 1,
+ "v0.0.1",
+ id="bad name type",
+ ),
+ pytest.param(
+ b"model data",
+ "model name",
+ 0.1,
+ id="bad version type",
+ ),
+ ],
+)
+def test_build_model_unsuccessful(data, name, version):
+ with pytest.raises(ValueError):
+ model = handler.build_model(data, name, version)
diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py
new file mode 100644
index 0000000000..6c9b3dc951
--- /dev/null
+++ b/tests/test_message_handler/test_build_model_key.py
@@ -0,0 +1,47 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+handler = MessageHandler()
+
+
+def test_build_model_key_successful():
+ fsd = "mock-feature-store-descriptor"
+ model_key = handler.build_model_key("tensor_key", fsd)
+ assert model_key.key == "tensor_key"
+ assert model_key.descriptor == fsd
+
+
+def test_build_model_key_unsuccessful():
+ with pytest.raises(ValueError):
+ fsd = "mock-feature-store-descriptor"
+ model_key = handler.build_model_key(100, fsd)
diff --git a/tests/test_message_handler/test_build_request_attributes.py b/tests/test_message_handler/test_build_request_attributes.py
new file mode 100644
index 0000000000..5b1e09b0aa
--- /dev/null
+++ b/tests/test_message_handler/test_build_request_attributes.py
@@ -0,0 +1,55 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+handler = MessageHandler()
+
+
+def test_build_torch_request_attributes_successful():
+ attribute = handler.build_torch_request_attributes("sparse")
+ assert attribute.tensorType == "sparse"
+
+
+def test_build_torch_request_attributes_unsuccessful():
+ with pytest.raises(ValueError):
+ attribute = handler.build_torch_request_attributes("invalid!")
+
+
+def test_build_tf_request_attributes_successful():
+ attribute = handler.build_tf_request_attributes(name="tfcnn", tensor_type="sparse")
+ assert attribute.tensorType == "sparse"
+ assert attribute.name == "tfcnn"
+
+
+def test_build_tf_request_attributes_unsuccessful():
+ with pytest.raises(ValueError):
+ attribute = handler.build_tf_request_attributes("tf_fail", "invalid!")
diff --git a/tests/test_message_handler/test_build_tensor_desc.py b/tests/test_message_handler/test_build_tensor_desc.py
new file mode 100644
index 0000000000..45126fb16c
--- /dev/null
+++ b/tests/test_message_handler/test_build_tensor_desc.py
@@ -0,0 +1,90 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+handler = MessageHandler()
+
+
+@pytest.mark.parametrize(
+ "dtype, order, dimension",
+ [
+ pytest.param(
+ "int8",
+ "c",
+ [3, 2, 5],
+ id="small torch tensor",
+ ),
+ pytest.param(
+ "int64",
+ "c",
+ [1040, 1040, 3],
+ id="medium torch tensor",
+ ),
+ ],
+)
+def test_build_tensor_descriptor_successful(dtype, order, dimension):
+ built_tensor_descriptor = handler.build_tensor_descriptor(order, dtype, dimension)
+ assert built_tensor_descriptor is not None
+ assert built_tensor_descriptor.order == order
+ assert built_tensor_descriptor.dataType == dtype
+ for i, j in zip(built_tensor_descriptor.dimensions, dimension):
+ assert i == j
+
+
+@pytest.mark.parametrize(
+ "dtype, order, dimension",
+ [
+ pytest.param(
+ "bad_order",
+ "int8",
+ [3, 2, 5],
+ id="bad order type",
+ ),
+ pytest.param(
+ "f",
+ "bad_num_type",
+ [3, 2, 5],
+ id="bad numerical type",
+ ),
+ pytest.param(
+ "f",
+ "int8",
+ "bad shape type",
+ id="bad shape type",
+ ),
+ ],
+)
+def test_build_tensor_descriptor_unsuccessful(order, dtype, dimension):
+ with pytest.raises(ValueError):
+ built_tensor_descriptor = handler.build_tensor_descriptor(
+ order, dtype, dimension
+ )
diff --git a/tests/test_message_handler/test_build_tensor_key.py b/tests/test_message_handler/test_build_tensor_key.py
new file mode 100644
index 0000000000..6a28b80c4f
--- /dev/null
+++ b/tests/test_message_handler/test_build_tensor_key.py
@@ -0,0 +1,46 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+handler = MessageHandler()
+
+
+def test_build_tensor_key_successful():
+ fsd = "mock-feature-store-descriptor"
+ tensor_key = handler.build_tensor_key("tensor_key", fsd)
+ assert tensor_key.key == "tensor_key"
+
+
+def test_build_tensor_key_unsuccessful():
+ with pytest.raises(ValueError):
+ fsd = "mock-feature-store-descriptor"
+ tensor_key = handler.build_tensor_key(100, fsd)
diff --git a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py
new file mode 100644
index 0000000000..beb9a47657
--- /dev/null
+++ b/tests/test_message_handler/test_output_descriptor.py
@@ -0,0 +1,78 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+handler = MessageHandler()
+
+fsd = "mock-feature-store-descriptor"
+tensor_key = handler.build_tensor_key("key", fsd)
+
+
+@pytest.mark.parametrize(
+ "order, keys, dtype, dimension",
+ [
+ pytest.param("c", [tensor_key], "int8", [1, 2, 3, 4], id="all specified"),
+ pytest.param(
+ "c", [tensor_key, tensor_key], "none", [1, 2, 3, 4], id="none dtype"
+ ),
+ pytest.param("c", [tensor_key], "int8", [], id="empty dimensions"),
+ pytest.param("c", [], "int8", [1, 2, 3, 4], id="empty keys"),
+ ],
+)
+def test_build_output_tensor_descriptor_successful(order, keys, dtype, dimension):
+ built_descriptor = handler.build_output_tensor_descriptor(
+ order, keys, dtype, dimension
+ )
+ assert built_descriptor is not None
+ assert built_descriptor.order == order
+ assert len(built_descriptor.optionalKeys) == len(keys)
+ assert built_descriptor.optionalDatatype == dtype
+ for i, j in zip(built_descriptor.optionalDimension, dimension):
+ assert i == j
+
+
+@pytest.mark.parametrize(
+ "order, keys, dtype, dimension",
+ [
+ pytest.param("bad_order", [], "int8", [3, 2, 5], id="bad order type"),
+ pytest.param(
+ "f", [tensor_key], "bad_num_type", [3, 2, 5], id="bad numerical type"
+ ),
+ pytest.param("f", [tensor_key], "int8", "bad shape type", id="bad shape type"),
+ pytest.param("f", ["tensor_key"], "int8", [3, 2, 5], id="bad key type"),
+ ],
+)
+def test_build_output_tensor_descriptor_unsuccessful(order, keys, dtype, dimension):
+ with pytest.raises(ValueError):
+ built_tensor = handler.build_output_tensor_descriptor(
+ order, keys, dtype, dimension
+ )
diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py
new file mode 100644
index 0000000000..a60818f7dd
--- /dev/null
+++ b/tests/test_message_handler/test_request.py
@@ -0,0 +1,449 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+fsd = "mock-feature-store-descriptor"
+
+model_key = MessageHandler.build_model_key("model_key", fsd)
+model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1")
+
+input_key1 = MessageHandler.build_tensor_key("input_key1", fsd)
+input_key2 = MessageHandler.build_tensor_key("input_key2", fsd)
+
+output_key1 = MessageHandler.build_tensor_key("output_key1", fsd)
+output_key2 = MessageHandler.build_tensor_key("output_key2", fsd)
+
+output_descriptor1 = MessageHandler.build_output_tensor_descriptor(
+ "c", [output_key1, output_key2], "int64", []
+)
+output_descriptor2 = MessageHandler.build_output_tensor_descriptor("f", [], "auto", [])
+output_descriptor3 = MessageHandler.build_output_tensor_descriptor(
+ "c", [output_key1], "none", [1, 2, 3]
+)
+torch_attributes = MessageHandler.build_torch_request_attributes("sparse")
+tf_attributes = MessageHandler.build_tf_request_attributes(
+ name="tf", tensor_type="sparse"
+)
+
+tensor_1 = MessageHandler.build_tensor_descriptor("c", "int8", [1])
+tensor_2 = MessageHandler.build_tensor_descriptor("c", "int64", [3, 2])
+tensor_3 = MessageHandler.build_tensor_descriptor("f", "int8", [1])
+tensor_4 = MessageHandler.build_tensor_descriptor("f", "int64", [3, 2])
+
+
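+# The module-level requests below cover both input shapes exercised by these
+# tests: "indirect" requests reference inputs/outputs by feature-store key,
+# while "direct" requests embed tensor descriptors inline with no output keys.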
+tf_indirect_request = MessageHandler.build_request(
+ b"reply",
+ model,
+ [input_key1, input_key2],
+ [output_key1, output_key2],
+ [output_descriptor1, output_descriptor2, output_descriptor3],
+ tf_attributes,
+)
+
+tf_direct_request = MessageHandler.build_request(
+ b"reply",
+ model,
+ [tensor_3, tensor_4],
+ [],
+ [output_descriptor1, output_descriptor2],
+ tf_attributes,
+)
+
+torch_indirect_request = MessageHandler.build_request(
+ b"reply",
+ model,
+ [input_key1, input_key2],
+ [output_key1, output_key2],
+ [output_descriptor1, output_descriptor2, output_descriptor3],
+ torch_attributes,
+)
+
+torch_direct_request = MessageHandler.build_request(
+ b"reply",
+ model,
+ [tensor_1, tensor_2],
+ [],
+ [output_descriptor1, output_descriptor2],
+ torch_attributes,
+)
+
+
+@pytest.mark.parametrize(
+ "reply_channel, model, input, output, output_descriptors, custom_attributes",
+ [
+ pytest.param(
+ "reply channel",
+ model_key,
+ [input_key1, input_key2],
+ [output_key1, output_key2],
+ [output_descriptor1],
+ torch_attributes,
+ ),
+ pytest.param(
+ "another reply channel",
+ model,
+ [input_key1],
+ [output_key2],
+ [output_descriptor1],
+ tf_attributes,
+ ),
+ pytest.param(
+ "another reply channel",
+ model,
+ [input_key1],
+ [output_key2],
+ [output_descriptor1],
+ torch_attributes,
+ ),
+ pytest.param(
+ "reply channel",
+ model_key,
+ [input_key1],
+ [output_key1],
+ [output_descriptor1],
+ None,
+ ),
+ ],
+)
+def test_build_request_indirect_successful(
+ reply_channel, model, input, output, output_descriptors, custom_attributes
+):
+ built_request = MessageHandler.build_request(
+ reply_channel,
+ model,
+ input,
+ output,
+ output_descriptors,
+ custom_attributes,
+ )
+ assert built_request is not None
+ assert built_request.replyChannel.descriptor == reply_channel
+ if built_request.model.which() == "key":
+ assert built_request.model.key.key == model.key
+ else:
+ assert built_request.model.data.data == model.data
+ assert built_request.model.data.name == model.name
+ assert built_request.model.data.version == model.version
+ assert built_request.input.which() == "keys"
+ assert built_request.input.keys[0].key == input[0].key
+ assert len(built_request.input.keys) == len(input)
+ assert len(built_request.output) == len(output)
+ for i, j in zip(built_request.outputDescriptors, output_descriptors):
+ assert i.order == j.order
+ if built_request.customAttributes.which() == "tf":
+ assert (
+ built_request.customAttributes.tf.tensorType == custom_attributes.tensorType
+ )
+ elif built_request.customAttributes.which() == "torch":
+ assert (
+ built_request.customAttributes.torch.tensorType
+ == custom_attributes.tensorType
+ )
+ else:
+ assert built_request.customAttributes.none == custom_attributes
+
+
+@pytest.mark.parametrize(
+ "reply_channel, model, input, output, output_descriptors, custom_attributes",
+ [
+ pytest.param(
+ [],
+ model_key,
+ [input_key1, input_key2],
+ [output_key1, output_key2],
+ [output_descriptor1],
+ tf_attributes,
+ id="bad channel",
+ ),
+ pytest.param(
+ "reply channel",
+ "bad model",
+ [input_key1],
+ [output_key2],
+ [output_descriptor1],
+ torch_attributes,
+ id="bad model",
+ ),
+ pytest.param(
+ "reply channel",
+ model_key,
+ ["input_key1", "input_key2"],
+ [output_key1, output_key2],
+ [output_descriptor1],
+ tf_attributes,
+ id="bad inputs",
+ ),
+ pytest.param(
+ "reply channel",
+ model_key,
+ [torch_attributes],
+ [output_key1, output_key2],
+ [output_descriptor1],
+ torch_attributes,
+ id="bad input schema type",
+ ),
+ pytest.param(
+ "reply channel",
+ model_key,
+ [input_key1],
+ ["output_key1", "output_key2"],
+ [output_descriptor1],
+ tf_attributes,
+ id="bad outputs",
+ ),
+ pytest.param(
+ "reply channel",
+ model_key,
+ [input_key1],
+ [torch_attributes],
+ [output_descriptor1],
+ tf_attributes,
+ id="bad output schema type",
+ ),
+ pytest.param(
+ "reply channel",
+ model_key,
+ [input_key1],
+ [output_key1, output_key2],
+ [output_descriptor1],
+ "bad attributes",
+ id="bad custom attributes",
+ ),
+ pytest.param(
+ "reply channel",
+ model_key,
+ [input_key1],
+ [output_key1, output_key2],
+ [output_descriptor1],
+ model_key,
+ id="bad custom attributes schema type",
+ ),
+ pytest.param(
+ "reply channel",
+ model_key,
+ [input_key1],
+ [output_key1, output_key2],
+ "bad descriptors",
+ torch_attributes,
+ id="bad output descriptors",
+ ),
+ ],
+)
+def test_build_request_indirect_unsuccessful(
+ reply_channel, model, input, output, output_descriptors, custom_attributes
+):
+ with pytest.raises(ValueError):
+ built_request = MessageHandler.build_request(
+ reply_channel,
+ model,
+ input,
+ output,
+ output_descriptors,
+ custom_attributes,
+ )
+
+
+@pytest.mark.parametrize(
+ "reply_channel, model, input, output, output_descriptors, custom_attributes",
+ [
+ pytest.param(
+ "reply channel",
+ model_key,
+ [tensor_1, tensor_2],
+ [],
+ [output_descriptor2],
+ torch_attributes,
+ ),
+ pytest.param(
+ "another reply channel",
+ model,
+ [tensor_1],
+ [],
+ [output_descriptor3],
+ tf_attributes,
+ ),
+ pytest.param(
+ "another reply channel",
+ model,
+ [tensor_2],
+ [],
+ [output_descriptor1],
+ tf_attributes,
+ ),
+ pytest.param(
+ "another reply channel",
+ model,
+ [tensor_1],
+ [],
+ [output_descriptor1],
+ None,
+ ),
+ ],
+)
+def test_build_request_direct_successful(
+ reply_channel, model, input, output, output_descriptors, custom_attributes
+):
+ built_request = MessageHandler.build_request(
+ reply_channel,
+ model,
+ input,
+ output,
+ output_descriptors,
+ custom_attributes,
+ )
+ assert built_request is not None
+ assert built_request.replyChannel.descriptor == reply_channel
+ if built_request.model.which() == "key":
+ assert built_request.model.key.key == model.key
+ else:
+ assert built_request.model.data.data == model.data
+ assert built_request.model.data.name == model.name
+ assert built_request.model.data.version == model.version
+ assert built_request.input.which() == "descriptors"
+ assert len(built_request.input.descriptors) == len(input)
+ assert len(built_request.output) == len(output)
+ for i, j in zip(built_request.outputDescriptors, output_descriptors):
+ assert i.order == j.order
+ if built_request.customAttributes.which() == "tf":
+ assert (
+ built_request.customAttributes.tf.tensorType == custom_attributes.tensorType
+ )
+ elif built_request.customAttributes.which() == "torch":
+ assert (
+ built_request.customAttributes.torch.tensorType
+ == custom_attributes.tensorType
+ )
+ else:
+ assert built_request.customAttributes.none == custom_attributes
+
+
+@pytest.mark.parametrize(
+ "reply_channel, model, input, output, output_descriptors, custom_attributes",
+ [
+ pytest.param(
+ [],
+ model_key,
+ [tensor_3, tensor_4],
+ [],
+ [output_descriptor2],
+ tf_attributes,
+ id="bad channel",
+ ),
+ pytest.param(
+ b"reply channel",
+ "bad model",
+ [tensor_4],
+ [],
+ [output_descriptor2],
+ tf_attributes,
+ id="bad model",
+ ),
+ pytest.param(
+ b"reply channel",
+ model_key,
+ ["input_key1", "input_key2"],
+ [],
+ [output_descriptor2],
+ torch_attributes,
+ id="bad inputs",
+ ),
+ pytest.param(
+ b"reply channel",
+ model_key,
+ [],
+ ["output_key1", "output_key2"],
+ [output_descriptor2],
+ tf_attributes,
+ id="bad outputs",
+ ),
+ pytest.param(
+ b"reply channel",
+ model_key,
+ [tensor_4],
+ [],
+ [output_descriptor2],
+ "bad attributes",
+ id="bad custom attributes",
+ ),
+ pytest.param(
+ b"reply_channel",
+ model_key,
+ [tensor_3, tensor_4],
+ [],
+ ["output_descriptor2"],
+ torch_attributes,
+ id="bad output descriptors",
+ ),
+ ],
+)
+def test_build_request_direct_unsuccessful(
+ reply_channel, model, input, output, output_descriptors, custom_attributes
+):
+ with pytest.raises(ValueError):
+ built_request = MessageHandler.build_request(
+ reply_channel,
+ model,
+ input,
+ output,
+ output_descriptors,
+ custom_attributes,
+ )
+
+
+@pytest.mark.parametrize(
+ "req",
+ [
+ pytest.param(tf_indirect_request, id="tf indirect"),
+ pytest.param(tf_direct_request, id="tf direct"),
+ pytest.param(torch_indirect_request, id="indirect"),
+ pytest.param(torch_direct_request, id="direct"),
+ ],
+)
+def test_serialize_request_successful(req):
+ serialized = MessageHandler.serialize_request(req)
+ assert type(serialized) == bytes
+
+ deserialized = MessageHandler.deserialize_request(serialized)
+ assert deserialized.to_dict() == req.to_dict()
+
+
+def test_serialization_fails():
+ with pytest.raises(ValueError):
+ bad_request = MessageHandler.serialize_request(tensor_1)
+
+
+def test_deserialization_fails():
+ with pytest.raises(ValueError):
+ new_req = torch_direct_request.copy()
+ req_bytes = MessageHandler.serialize_request(new_req)
+ req_bytes = req_bytes + b"extra bytes"
+ deser = MessageHandler.deserialize_request(req_bytes)
diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py
new file mode 100644
index 0000000000..86774132ec
--- /dev/null
+++ b/tests/test_message_handler/test_response.py
@@ -0,0 +1,191 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from smartsim._core.mli.message_handler import MessageHandler
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+fsd = "mock-feature-store-descriptor"
+
+result_key1 = MessageHandler.build_tensor_key("result_key1", fsd)
+result_key2 = MessageHandler.build_tensor_key("result_key2", fsd)
+
+torch_attributes = MessageHandler.build_torch_response_attributes()
+tf_attributes = MessageHandler.build_tf_response_attributes()
+
+tensor1 = MessageHandler.build_tensor_descriptor("c", "int8", [1])
+tensor2 = MessageHandler.build_tensor_descriptor("c", "int64", [3, 2])
+
+
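+# Mirroring the request tests, responses carry either tensor keys ("indirect")
+# or tensor descriptors ("direct") as their result payload.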
+tf_indirect_response = MessageHandler.build_response(
+ "complete",
+ "Success!",
+ [result_key1, result_key2],
+ tf_attributes,
+)
+
+tf_direct_response = MessageHandler.build_response(
+ "complete",
+ "Success again!",
+ [tensor2, tensor1],
+ tf_attributes,
+)
+
+torch_indirect_response = MessageHandler.build_response(
+ "complete",
+ "Success!",
+ [result_key1, result_key2],
+ torch_attributes,
+)
+
+torch_direct_response = MessageHandler.build_response(
+ "complete",
+ "Success again!",
+ [tensor1, tensor2],
+ torch_attributes,
+)
+
+
+@pytest.mark.parametrize(
+ "status, status_message, result, custom_attribute",
+ [
+ pytest.param(
+ 200,
+ "Yay, it worked!",
+ [tensor1, tensor2],
+ None,
+ id="tensor descriptor list",
+ ),
+ pytest.param(
+ 200,
+ "Yay, it worked!",
+ [result_key1, result_key2],
+ tf_attributes,
+ id="tensor key list",
+ ),
+ ],
+)
+def test_build_response_successful(status, status_message, result, custom_attribute):
+ response = MessageHandler.build_response(
+ status=status,
+ message=status_message,
+ result=result,
+ custom_attributes=custom_attribute,
+ )
+ assert response is not None
+ assert response.status == status
+ assert response.message == status_message
+ if response.result.which() == "keys":
+ assert response.result.keys[0].to_dict() == result[0].to_dict()
+ else:
+ assert response.result.descriptors[0].to_dict() == result[0].to_dict()
+
+
+@pytest.mark.parametrize(
+ "status, status_message, result, custom_attribute",
+ [
+ pytest.param(
+ "bad status",
+ "Yay, it worked!",
+ [tensor1, tensor2],
+ None,
+ id="bad status",
+ ),
+ pytest.param(
+ "complete",
+ 200,
+ [tensor2],
+ torch_attributes,
+ id="bad status message",
+ ),
+ pytest.param(
+ "complete",
+ "Yay, it worked!",
+ ["result_key1", "result_key2"],
+ tf_attributes,
+ id="bad result",
+ ),
+ pytest.param(
+ "complete",
+ "Yay, it worked!",
+ [tf_attributes],
+ tf_attributes,
+ id="bad result type",
+ ),
+ pytest.param(
+ "complete",
+ "Yay, it worked!",
+ [tensor2, tensor1],
+ "custom attributes",
+ id="bad custom attributes",
+ ),
+ pytest.param(
+ "complete",
+ "Yay, it worked!",
+ [tensor2, tensor1],
+ result_key1,
+ id="bad custom attributes type",
+ ),
+ ],
+)
+def test_build_response_unsuccessful(status, status_message, result, custom_attribute):
+ with pytest.raises(ValueError):
+ response = MessageHandler.build_response(
+ status, status_message, result, custom_attribute
+ )
+
+
+@pytest.mark.parametrize(
+ "response",
+ [
+ pytest.param(torch_indirect_response, id="indirect"),
+ pytest.param(torch_direct_response, id="direct"),
+ pytest.param(tf_indirect_response, id="tf indirect"),
+ pytest.param(tf_direct_response, id="tf direct"),
+ ],
+)
+def test_serialize_response(response):
+ serialized = MessageHandler.serialize_response(response)
+ assert type(serialized) == bytes
+
+ deserialized = MessageHandler.deserialize_response(serialized)
+ assert deserialized.to_dict() == response.to_dict()
+
+
+def test_serialization_fails():
+ with pytest.raises(ValueError):
+ bad_response = MessageHandler.serialize_response(result_key1)
+
+
+def test_deserialization_fails():
+ with pytest.raises(ValueError):
+ new_resp = torch_direct_response.copy()
+ resp_bytes = MessageHandler.serialize_response(new_resp)
+ resp_bytes = resp_bytes + b"extra bytes"
+ deser = MessageHandler.deserialize_response(resp_bytes)
diff --git a/tests/test_node_prioritizer.py b/tests/test_node_prioritizer.py
new file mode 100644
index 0000000000..905c0ecc90
--- /dev/null
+++ b/tests/test_node_prioritizer.py
@@ -0,0 +1,553 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import random
+import threading
+import typing as t
+
+import pytest
+
+from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+# The tests in this file belong to the group_b group
+pytestmark = pytest.mark.group_b
+
+
+logger = get_logger(__name__)
+
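+# These tests exercise NodePrioritizer's reference counting: increment and
+# decrement adjust a host's ref count directly, while next/next_n return only
+# unassigned ("clean") hosts, optionally restricted by a PrioritizerFilter
+# (CPU/GPU) or an explicit host list.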
+
+class MockNode:
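+    """Minimal stand-in for a compute node, exposing only the attributes
+    (hostname, num_cpus, num_gpus) that the prioritizer tests rely on."""
+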
+ def __init__(self, hostname: str, num_cpus: int, num_gpus: int) -> None:
+ self.hostname = hostname
+ self.num_cpus = num_cpus
+ self.num_gpus = num_gpus
+
+
+def mock_node_hosts(
+ num_cpu_nodes: int, num_gpu_nodes: int
+) -> t.Tuple[t.List[str], t.List[str]]:
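+    """Generate parallel lists of CPU-only and GPU hostnames for a mock cluster."""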
+ cpu_hosts = [f"cpu-node-{i}" for i in range(num_cpu_nodes)]
+ gpu_hosts = [f"gpu-node-{i}" for i in range(num_gpu_nodes)]
+
+ return cpu_hosts, gpu_hosts
+
+
+def mock_node_builder(num_cpu_nodes: int, num_gpu_nodes: int) -> t.List[MockNode]:
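+    """Create MockNode instances: CPU-only hosts with 4 CPUs / 0 GPUs followed
+    by GPU hosts with 4 CPUs / 4 GPUs."""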
+ nodes = []
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+
+ nodes.extend(MockNode(hostname, 4, 0) for hostname in cpu_hosts)
+ nodes.extend(MockNode(hostname, 4, 4) for hostname in gpu_hosts)
+
+ return nodes
+
+
+def test_node_prioritizer_init_null() -> None:
+ """Verify that the priorizer reports failures to send a valid node set
+ if a null value is passed"""
+ lock = threading.RLock()
+ with pytest.raises(SmartSimError) as ex:
+ NodePrioritizer(None, lock)
+
+ assert "Missing" in ex.value.args[0]
+
+
+def test_node_prioritizer_init_empty() -> None:
+ """Verify that the priorizer reports failures to send a valid node set
+ if an empty list is passed"""
+ lock = threading.RLock()
+ with pytest.raises(SmartSimError) as ex:
+ NodePrioritizer([], lock)
+
+ assert "Missing" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+ "num_cpu_nodes,num_gpu_nodes", [(1, 1), (2, 1), (1, 2), (8, 4), (1000, 200)]
+)
+def test_node_prioritizer_init_ok(num_cpu_nodes: int, num_gpu_nodes: int) -> None:
+ """Verify that initialization with a valid node list results in the
+ appropriate cpu & gpu ref counts, and complete ref map"""
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ # perform prioritizer initialization
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # get a copy of all the expected host names
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ all_hosts = cpu_hosts + gpu_hosts
+ assert len(all_hosts) == num_cpu_nodes + num_gpu_nodes
+
+ # verify tracking data is initialized correctly for all nodes
+ for hostname in all_hosts:
+ # show that the ref map is tracking the node
+ assert hostname in p._nodes
+
+ tracking_info = p.get_tracking_info(hostname)
+
+ # show that the node is created w/zero ref counts
+ assert tracking_info.num_refs == 0
+
+ # show that the node is created and marked as not dirty (unchanged)
+ # assert tracking_info.is_dirty == False
+
+ # iterate through known cpu node keys and verify prioritizer initialization
+ for hostname in cpu_hosts:
+ # show that the device ref counters are appropriately assigned
+ cpu_ref = next((n for n in p._cpu_refs if n.hostname == hostname), None)
+ assert cpu_ref, "CPU-only node not found in cpu ref set"
+
+ gpu_ref = next((n for n in p._gpu_refs if n.hostname == hostname), None)
+ assert not gpu_ref, "CPU-only node should not be found in gpu ref set"
+
+ # iterate through known GPU node keys and verify prioritizer initialization
+ for hostname in gpu_hosts:
+ # show that the device ref counters are appropriately assigned
+ gpu_ref = next((n for n in p._gpu_refs if n.hostname == hostname), None)
+ assert gpu_ref, "GPU-only node not found in gpu ref set"
+
+ cpu_ref = next((n for n in p._cpu_refs if n.hostname == hostname), None)
+ assert not cpu_ref, "GPU-only node should not be found in cpu ref set"
+
+ # verify we have all hosts in the ref map
+ assert set(p._nodes.keys()) == set(all_hosts)
+
+ # verify we have no extra hosts in ref map
+ assert len(p._nodes.keys()) == len(set(all_hosts))
+
+
+def test_node_prioritizer_direct_increment() -> None:
+ """Verify that performing the increment operation causes the expected
+ side effect on the intended records"""
+
+ num_cpu_nodes, num_gpu_nodes = 32, 8
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ exclude_index = 2
+ exclude_host0 = cpu_hosts[exclude_index]
+ exclude_host1 = gpu_hosts[exclude_index]
+ exclusions = [exclude_host0, exclude_host1]
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # let's increment each element in a predictable way and verify
+ for node in nodes:
+ if node.hostname in exclusions:
+ # expect 1 cpu and 1 gpu node at zero and not incremented
+ continue
+
+ if node.num_gpus == 0:
+ num_increments = random.randint(0, num_cpu_nodes - 1)
+ else:
+ num_increments = random.randint(0, num_gpu_nodes - 1)
+
+ # increment this node some random number of times
+ for _ in range(num_increments):
+ p.increment(node.hostname)
+
+ # ... and verify the correct incrementing is applied
+ tracking_info = p.get_tracking_info(node.hostname)
+ assert tracking_info.num_refs == num_increments
+
+ # verify the excluded cpu node was never changed
+ tracking_info0 = p.get_tracking_info(exclude_host0)
+ assert tracking_info0.num_refs == 0
+
+ # verify the excluded gpu node was never changed
+ tracking_info1 = p.get_tracking_info(exclude_host1)
+ assert tracking_info1.num_refs == 0
+
+
+def test_node_prioritizer_indirect_increment() -> None:
+ """Verify that performing the increment operation indirectly affects
+ each available node until we run out of nodes to return"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # verify starting state
+ for node in p._nodes.values():
+ tracking_info = p.get_tracking_info(node.hostname)
+
+ assert node.num_refs == 0 # <--- ref count starts at zero
+ assert tracking_info.num_refs == 0 # <--- ref count starts at zero
+
+    # perform indirect increments by retrieving each node via `next`
+ for node in p._nodes.values():
+ tracking_info = p.get_tracking_info(node.hostname)
+
+ # apply `next` operation and verify tracking info reflects new ref
+ node = p.next(PrioritizerFilter.CPU)
+ tracking_info = p.get_tracking_info(node.hostname)
+
+ # verify side-effects
+ assert tracking_info.num_refs > 0 # <--- ref count should now be > 0
+
+ # we expect it to give back only "clean" nodes from next*
+ assert tracking_info.is_dirty == False # NOTE: this is "hidden" by protocol
+
+ # every node should be incremented now. prioritizer shouldn't have anything to give
+ tracking_info = p.next(PrioritizerFilter.CPU)
+ assert tracking_info is None # <--- get_next shouldn't have any nodes to give
+
+
+def test_node_prioritizer_indirect_decrement_availability() -> None:
+ """Verify that a node who is decremented (dirty) is made assignable
+ on a subsequent request"""
+
+ num_cpu_nodes, num_gpu_nodes = 1, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # increment our only node...
+ p.increment(cpu_hosts[0])
+
+ tracking_info = p.next()
+ assert tracking_info is None, "No nodes should be assignable"
+
+ # perform a decrement...
+ p.decrement(cpu_hosts[0])
+
+ # ... and confirm that the node is available again
+ tracking_info = p.next()
+ assert tracking_info is not None, "A node should be assignable"
+
+
+def test_node_prioritizer_multi_increment() -> None:
+ """Verify that retrieving multiple nodes via `next_n` API correctly
+ increments reference counts and returns appropriate results"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # Mark some nodes as dirty to verify retrieval
+ p.increment(cpu_hosts[0])
+ assert p.get_tracking_info(cpu_hosts[0]).num_refs > 0
+
+ p.increment(cpu_hosts[2])
+ assert p.get_tracking_info(cpu_hosts[2]).num_refs > 0
+
+ p.increment(cpu_hosts[4])
+ assert p.get_tracking_info(cpu_hosts[4]).num_refs > 0
+
+ # use next_n w/the minimum allowed value
+ all_tracking_info = p.next_n(1, PrioritizerFilter.CPU) # <---- next_n(1)
+
+ # confirm the number requested is honored
+ assert len(all_tracking_info) == 1
+ # ensure no unavailable node is returned
+ assert all_tracking_info[0].hostname not in [
+ cpu_hosts[0],
+ cpu_hosts[2],
+ cpu_hosts[4],
+ ]
+
+ # use next_n w/value that exceeds available number of open nodes
+ # 3 direct increments in setup, 1 out of next_n(1), 4 left
+ all_tracking_info = p.next_n(5, PrioritizerFilter.CPU)
+
+ # confirm that no nodes are returned, even though 4 out of 5 requested are available
+ assert len(all_tracking_info) == 0
+
+
+def test_node_prioritizer_multi_increment_validate_n() -> None:
+ """Verify that retrieving multiple nodes via `next_n` API correctly
+ reports failures when the request size is above pool size"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # we have 8 total cpu nodes available... request too many nodes
+ all_tracking_info = p.next_n(9, PrioritizerFilter.CPU)
+ assert len(all_tracking_info) == 0
+
+ all_tracking_info = p.next_n(num_cpu_nodes * 1000, PrioritizerFilter.CPU)
+ assert len(all_tracking_info) == 0
+
+
+def test_node_prioritizer_indirect_direct_interleaved_increments() -> None:
+ """Verify that interleaving indirect and direct increments results in
+ expected ref counts"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 4
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # perform some set of non-popped increments
+ p.increment(gpu_hosts[1])
+ p.increment(gpu_hosts[3])
+ p.increment(gpu_hosts[3])
+
+ # increment 0th item 1x
+ p.increment(cpu_hosts[0])
+
+    # increment 3rd item 2x
+ p.increment(cpu_hosts[3])
+ p.increment(cpu_hosts[3])
+
+ # increment last item 3x
+ p.increment(cpu_hosts[7])
+ p.increment(cpu_hosts[7])
+ p.increment(cpu_hosts[7])
+
+ tracking_info = p.get_tracking_info(gpu_hosts[1])
+ assert tracking_info.num_refs == 1
+
+ tracking_info = p.get_tracking_info(gpu_hosts[3])
+ assert tracking_info.num_refs == 2
+
+ nodes = [n for n in p._nodes.values() if n.num_refs == 0 and n.num_gpus == 0]
+
+ # we should skip the 0-th item in the heap due to direct increment
+ tracking_info = p.next(PrioritizerFilter.CPU)
+ assert tracking_info.num_refs == 1
+ # confirm we get a cpu node
+ assert "cpu-node" in tracking_info.hostname
+
+ # this should pull the next item right out
+ tracking_info = p.next(PrioritizerFilter.CPU)
+ assert tracking_info.num_refs == 1
+ assert "cpu-node" in tracking_info.hostname
+
+ # ensure we pull from gpu nodes and the 0th item is returned
+ tracking_info = p.next(PrioritizerFilter.GPU)
+ assert tracking_info.num_refs == 1
+ assert "gpu-node" in tracking_info.hostname
+
+    # we should step over the 3rd node on this iteration
+ tracking_info = p.next(PrioritizerFilter.CPU)
+ assert tracking_info.num_refs == 1
+ assert "cpu-node" in tracking_info.hostname
+
+ # and ensure that heap also steps over a direct increment
+ tracking_info = p.next(PrioritizerFilter.GPU)
+ assert tracking_info.num_refs == 1
+ assert "gpu-node" in tracking_info.hostname
+
+ # and another GPU request should return nothing
+ tracking_info = p.next(PrioritizerFilter.GPU)
+ assert tracking_info is None
+
+
+def test_node_prioritizer_decrement_floor() -> None:
+ """Verify that repeatedly decrementing ref counts does not
+ allow negative ref counts"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 4
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # try a ton of decrements on all the items in the prioritizer
+ for _ in range(len(nodes) * 100):
+ index = random.randint(0, num_cpu_nodes - 1)
+ p.decrement(cpu_hosts[index])
+
+ index = random.randint(0, num_gpu_nodes - 1)
+ p.decrement(gpu_hosts[index])
+
+ for node in nodes:
+ tracking_info = p.get_tracking_info(node.hostname)
+ assert tracking_info.num_refs == 0
+
+
+@pytest.mark.parametrize("num_requested", [1, 2, 3])
+def test_node_prioritizer_multi_increment_subheap(num_requested: int) -> None:
+ """Verify that retrieving multiple nodes via `next_n` API correctly
+ increments reference counts and returns appropriate results
+ when requesting an in-bounds number of nodes"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # Mark some nodes as dirty to verify retrieval
+ p.increment(cpu_hosts[0])
+ p.increment(cpu_hosts[2])
+ p.increment(cpu_hosts[4])
+
+ hostnames = [cpu_hosts[0], cpu_hosts[1], cpu_hosts[2], cpu_hosts[3], cpu_hosts[5]]
+
+ # request n == {num_requested} nodes from set of 3 available
+ all_tracking_info = p.next_n(
+ num_requested,
+ hosts=hostnames,
+ ) # <---- w/0,2,4 assigned, only 1,3,5 from hostnames can work
+
+ # all parameterizations should result in a matching output size
+ assert len(all_tracking_info) == num_requested
+
+
+def test_node_prioritizer_multi_increment_subheap_assigned() -> None:
+ """Verify that retrieving multiple nodes via `next_n` API does
+ not return anything when the number requested cannot be satisfied
+ by the given subheap due to prior assignment"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # Mark some nodes as dirty to verify retrieval
+ p.increment(cpu_hosts[0])
+ p.increment(cpu_hosts[2])
+
+ hostnames = [
+ cpu_hosts[0],
+ "x" + cpu_hosts[2],
+ ] # <--- we can't get 2 from 1 valid node name
+
+    # request 2 nodes from a host list that cannot satisfy the request
+ num_requested = 2
+ all_tracking_info = p.next_n(num_requested, hosts=hostnames)
+
+ # w/0,2 assigned, nothing can be returned
+ assert len(all_tracking_info) == 0
+
+
+def test_node_prioritizer_empty_subheap_next_w_no_hosts() -> None:
+ """Verify that retrieving multiple nodes via `next_n` API does
+ with an empty host list uses the entire available host list"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # Mark some nodes as dirty to verify retrieval
+ p.increment(cpu_hosts[0])
+ p.increment(cpu_hosts[2])
+
+    hostnames = []
+
+    # request a node with no host filter; the full pool should be used
+    node = p.next(hosts=hostnames)
+    assert node
+
+
+def test_node_prioritizer_empty_subheap_next_n_w_hosts() -> None:
+ """Verify that retrieving multiple nodes via `next_n` API does
+ not blow up with an empty host list"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # Mark some nodes as dirty to verify retrieval
+ p.increment(cpu_hosts[0])
+ p.increment(cpu_hosts[2])
+
+ hostnames = []
+
+    # request a single node via next_n with an empty host filter
+ num_requested = 1
+ node = p.next_n(num_requested, hosts=hostnames)
+ assert node is not None
+
+
+@pytest.mark.parametrize("num_requested", [-100, -1, 0])
+def test_node_prioritizer_empty_subheap_next_n(num_requested: int) -> None:
+ """Verify that retrieving a node via `next_n` API does
+ not allow a request with num_items < 1"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # Mark some nodes as dirty to verify retrieval
+ p.increment(cpu_hosts[0])
+ p.increment(cpu_hosts[2])
+
+ # request n == {num_requested} nodes from set of 3 available
+ with pytest.raises(ValueError) as ex:
+ p.next_n(num_requested)
+
+ assert "Number of items requested" in ex.value.args[0]
+
+
+@pytest.mark.parametrize("num_requested", [-100, -1, 0])
+def test_node_prioritizer_subheap_next_n_invalid_count(num_requested: int) -> None:
+    """Verify that retrieving multiple nodes via `next_n` API does
+    not allow a request with num_items < 1 when a host subset is supplied"""
+
+ num_cpu_nodes, num_gpu_nodes = 8, 0
+ cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+ nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+ lock = threading.RLock()
+ p = NodePrioritizer(nodes, lock)
+
+ # Mark some nodes as dirty to verify retrieval
+ p.increment(cpu_hosts[0])
+ p.increment(cpu_hosts[2])
+
+ hostnames = [cpu_hosts[0], cpu_hosts[2]]
+
+ # request n == {num_requested} nodes from set of 3 available
+ with pytest.raises(ValueError) as ex:
+ p.next_n(num_requested, hosts=hostnames)
+
+ assert "Number of items requested" in ex.value.args[0]