diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1b6c763515..63622ebb08 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -50,6 +50,10 @@
 # ON_QUARTZ:
 # Should the Quartz pipeline run? Set to  "ON" or "OFF" to enable/disable.
 #
+# SPACK_PREFIX: prefix used for shared spack installation.
+# Usually this is a spack version number that matches the version set in the uberenv_config.json file.
+# Spack installs go in /usr/workspace/sundials/spack_installs/${SPACK_PREFIX}/$(hostname).
+#
 # SHARED_SPACK:
 # If "ON", then a shared spack install that has been pre-configured is utilized.
 # If "OFF", then a new spack instance is created for every build (meaning all TPLs have to be installed).
@@ -75,6 +79,8 @@ variables:
   VERBOSE_TEST: "OFF"
   ON_LASSEN: "ON"
   ON_QUARTZ: "ON"
+  ON_CORONA: "ON"
+  SPACK_PREFIX: "v0.19.1"
   SHARED_SPACK: "UPSTREAM"
   BENCHMARK: "OFF"
   BENCHMARK_NNODES: 4
@@ -89,7 +95,7 @@ stages:
   - q_build_and_test
   - l_build_and_test
   - l_build_and_bench
-  # - c_build_and_test
+  - c_build_and_test
 
 # These are also templates (.name) that define project specific build commands.
 # If an allocation exist with the name defined in this pipeline, the job will
@@ -101,9 +107,10 @@ stages:
       --job-name=${ALLOC_NAME} .gitlab/build_and_test.sh
 
 # Corona
-.build_toss_3_x86_64_ib_corona_script:
+.build_toss_4_x86_64_ib_corona_script:
   script:
-    - srun -p mi60 --interactive -t ${DEFAULT_TIME} -N 1 .gitlab/build_and_test.sh
+    - echo ${ALLOC_NAME}
+    - flux alloc -N 1 -t ${DEFAULT_TIME}  .gitlab/build_and_test.sh
 
 # CORAL systems use spectrum LSF instead of SLURM
 .build_blueos_3_ppc64le_ib_script:
@@ -123,3 +130,5 @@ include:
   - local: .gitlab/quartz-jobs.yml
   - local: .gitlab/lassen-templates.yml
   - local: .gitlab/lassen-jobs.yml
+  - local: .gitlab/corona-templates.yml
+  - local: .gitlab/corona-jobs.yml
diff --git a/.gitlab/build_and_test.sh b/.gitlab/build_and_test.sh
index 51d5bc63bf..1bb0aec5bf 100755
--- a/.gitlab/build_and_test.sh
+++ b/.gitlab/build_and_test.sh
@@ -20,6 +20,7 @@ job_unique_id=${CI_JOB_ID:-""}
 sys_type=${SYS_TYPE:-""}
 py_env_path=${PYTHON_ENVIRONMENT_PATH:-""}
 
+spack_prefix=${SPACK_PREFIX:-${SHARED_SPACK_PREFIX:-"v0.19.1"}}  # CI sets SPACK_PREFIX; SHARED_SPACK_PREFIX is still honored as a fallback
 shared_spack=${SHARED_SPACK:-"UPSTREAM"}
 
 # Dependencies
@@ -46,8 +47,16 @@ hostname=${hostname%%[0-9]*}
 BUILD_JOBS=${BUILD_JOBS:-"1"}
 
 # load newer python to try the clingo concretizer
-echo "module load python/3.8.2"
-module load python/3.8.2
+# machine specific loads
+if [[ "${hostname}" == "corona" ]]; then
+    ci_modules=(python/3.9.12 rocm/5.4.1)  # corona additionally loads ROCm
+else
+    ci_modules=(python/3.8.2)
+fi
+for ci_mod in "${ci_modules[@]}"; do  # echo each load so it is visible in the job log
+    echo "module load ${ci_mod}"
+    module load "${ci_mod}"
+done
 
 if [[ "${option}" != "--build-only" && "${option}" != "--test-only" ]]
 then
@@ -89,7 +98,7 @@ then
 
     if [[ -d /usr/workspace/sundials ]]
     then
-        upstream="/usr/workspace/sundials/spack_installs/${hostname}"
+        upstream="/usr/workspace/sundials/spack_installs/${spack_prefix}/${hostname}"
         mkdir -p "${upstream}"
         upstream_opt="--upstream=${upstream}"
     fi
@@ -174,7 +183,7 @@ then
     mkdir -p "${build_dir}" && cd "${build_dir}"
 
     date
-    
+
     $cmake_exe --version
 
     # configure
diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml
new file mode 100644
index 0000000000..450a9197ba
--- /dev/null
+++ b/.gitlab/corona-jobs.yml
@@ -0,0 +1,37 @@
+# ------------------------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2021, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ------------------------------------------------------------------------------
+
+# ------------------------------------------------------------------------------
+# HIP
+# ------------------------------------------------------------------------------
+
+# Builds with HIP
+corona_clang_hip:
+  parallel:
+    matrix:
+      - COMPILER_SPEC: rocmcc@5.4.1
+        AMDGPU_TARGET: [gfx906]
+  variables:
+    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 precision=double amdgpu_target=${AMDGPU_TARGET} scheduler=flux +rocm+mpi"
+  extends: .corona_build_and_test
+
+# ------------------------------------------------------------------------------
+# HIP + TPLs
+# ------------------------------------------------------------------------------
+corona_clang_hip_tpls:
+  parallel:
+    matrix:
+      - COMPILER_SPEC: rocmcc@5.4.1
+        AMDGPU_TARGET: [gfx906]
+  variables:
+    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 precision=double ~int64 amdgpu_target=${AMDGPU_TARGET} scheduler=flux +rocm+mpi+magma+raja ^magma+rocm amdgpu_target=${AMDGPU_TARGET} ^raja+rocm~openmp~examples~exercises amdgpu_target=${AMDGPU_TARGET}"
+  extends: .corona_build_and_test
diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml
new file mode 100644
index 0000000000..a901f4fa48
--- /dev/null
+++ b/.gitlab/corona-templates.yml
@@ -0,0 +1,35 @@
+# ------------------------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2021, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ------------------------------------------------------------------------------
+
+# ------------------------------------------------------------------------------
+# Tags and rules to run tests on Corona
+# ------------------------------------------------------------------------------
+
+# Generic Corona build job, extending build script for Toss 4 x86_64 Systems
+.corona_build_and_test:
+  tags:
+    - shell
+    - corona
+  extends: [.build_toss_4_x86_64_ib_corona_script]
+  stage: c_build_and_test
+  needs: []
+  artifacts:
+    paths:
+      - spack-*.txt
+      - build_*/*
+    when: always
+  rules:
+    # Don't run if...
+    - if: '$CI_COMMIT_BRANCH =~ /_cnone/ || $ON_CORONA == "OFF" || $BENCHMARK == "ON"'
+      when: never
+    # Default is to run if previous stage succeeded
+    - when: on_success
diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml
index 7dc51e99d4..388b95ec08 100644
--- a/.gitlab/lassen-jobs.yml
+++ b/.gitlab/lassen-jobs.yml
@@ -78,7 +78,7 @@ lassen_gcc_cuda_tpls:
       - COMPILER_SPEC: gcc@8.3.1
         CUDA_SPEC: [cuda@11.5.0]
   variables:
-    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 precision=double ~int64 +mpi+openmp+cuda+raja+magma+superlu-dist cuda_arch=70 ^superlu-dist+cuda cuda_arch=70 ^magma+cuda cuda_arch=70 ^raja+cuda~openmp~examples~exercises cuda_arch=70 ^${CUDA_SPEC}"
+    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 precision=double ~int64 +mpi+openmp+cuda+raja+magma+superlu-dist+petsc+hypre+ginkgo cuda_arch=70 ^ginkgo+cuda cuda_arch=70 ^hypre ^petsc+cuda cuda_arch=70 ^superlu-dist+cuda cuda_arch=70 ^magma+cuda cuda_arch=70 ^raja+cuda~openmp~examples~exercises cuda_arch=70 ^${CUDA_SPEC}"
   extends: .lassen_build_and_test
 
 # ------------------------------------------------------------------------------
diff --git a/.gitlab/quartz-jobs.yml b/.gitlab/quartz-jobs.yml
index 6e4b87a236..93599c3188 100644
--- a/.gitlab/quartz-jobs.yml
+++ b/.gitlab/quartz-jobs.yml
@@ -55,11 +55,11 @@
 quartz_clang_tpls:
   parallel:
     matrix:
-      - COMPILER_SPEC: clang@12.0.0
+      - COMPILER_SPEC: clang@12.0.1
         INDEX_SPEC: [~int64]
         PRECISION_SPEC: [double]
   variables:
-    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre +superlu-dist +lapack +klu"
+    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre +superlu-dist +lapack +klu +petsc ^suite-sparse@5.13.0 ^openblas"
   extends: .quartz_build_and_test
 
 quartz_gcc_tpls:
@@ -69,7 +69,7 @@ quartz_gcc_tpls:
         INDEX_SPEC: [~int64]
         PRECISION_SPEC: [double]
   variables:
-    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre +superlu-dist +lapack +klu"
+    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre +superlu-dist +lapack +klu +petsc ^suite-sparse@5.13.0"
   extends: .quartz_build_and_test
 
 quartz_intel_tpls:
@@ -79,5 +79,5 @@ quartz_intel_tpls:
         INDEX_SPEC: [~int64]
         PRECISION_SPEC: [double]
   variables:
-    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre ~superlu-dist +lapack +klu"
+    SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre ~superlu-dist +lapack +klu ^suite-sparse@5.13.0"
   extends: .quartz_build_and_test
diff --git a/.gitlab/radiuss-spack-configs b/.gitlab/radiuss-spack-configs
index 8194d152ac..6f22afc1ec 160000
--- a/.gitlab/radiuss-spack-configs
+++ b/.gitlab/radiuss-spack-configs
@@ -1 +1 @@
-Subproject commit 8194d152acfdfcc8ad5a27051e9988f1e20e8779
+Subproject commit 6f22afc1ece86c479d2d2a64e14736ef00b632d6
diff --git a/.gitlab/spack_packages/camp/package.py b/.gitlab/spack_packages/camp/package.py
deleted file mode 100644
index aa801d21dc..0000000000
--- a/.gitlab/spack_packages/camp/package.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
-# Spack Project Developers. See the top-level COPYRIGHT file for details.
-#
-# SPDX-License-Identifier: (Apache-2.0 OR MIT)
-
-from spack import *
-
-
-class Camp(CMakePackage, CudaPackage, ROCmPackage):
-    """
-    Compiler agnostic metaprogramming library providing concepts,
-    type operations and tuples for C++ and cuda
-    """
-
-    homepage = "https://github.com/LLNL/camp"
-    git      = "https://github.com/LLNL/camp.git"
-    url      = "https://github.com/LLNL/camp/archive/v0.1.0.tar.gz"
-
-    maintainers = ['trws']
-
-    version('main', branch='main', submodules='True')
-    version('2022.03.2', sha256='e9090d5ee191ea3a8e36b47a8fe78f3ac95d51804f1d986d931e85b8f8dad721')
-    version('2022.03.0', sha256='e9090d5ee191ea3a8e36b47a8fe78f3ac95d51804f1d986d931e85b8f8dad721')
-    version('0.3.0', sha256='129431a049ca5825443038ad5a37a86ba6d09b2618d5fe65d35f83136575afdb')
-    version('0.2.3', sha256='58a0f3bd5eadb588d7dc83f3d050aff8c8db639fc89e8d6553f9ce34fc2421a7')
-    version('0.2.2', sha256='194d38b57e50e3494482a7f94940b27f37a2bee8291f2574d64db342b981d819')
-    version('0.1.0', sha256='fd4f0f2a60b82a12a1d9f943f8893dc6fe770db493f8fae5ef6f7d0c439bebcc')
-
-    # TODO: figure out gtest dependency and then set this default True.
-    variant('tests', default=False, description='Build tests')
-
-    depends_on('cub', when='+cuda')
-
-    depends_on('blt')
-
-    def cmake_args(self):
-        spec = self.spec
-
-        options = []
-
-        options.append("-DBLT_SOURCE_DIR={0}".format(spec['blt'].prefix))
-
-        if '+cuda' in spec:
-            options.extend([
-                '-DENABLE_CUDA=ON',
-                '-DCUDA_TOOLKIT_ROOT_DIR=%s' % (spec['cuda'].prefix)])
-
-            if not spec.satisfies('cuda_arch=none'):
-                cuda_arch = spec.variants['cuda_arch'].value
-                options.append('-DCUDA_ARCH=sm_{0}'.format(cuda_arch[0]))
-                flag = '-arch sm_{0}'.format(cuda_arch[0])
-                options.append('-DCMAKE_CUDA_FLAGS:STRING={0}'.format(flag))
-        else:
-            options.append('-DENABLE_CUDA=OFF')
-
-        if '+rocm' in spec:
-            options.extend([
-                '-DENABLE_HIP=ON',
-                '-DHIP_ROOT_DIR={0}'.format(spec['hip'].prefix)
-            ])
-            archs = self.spec.variants['amdgpu_target'].value
-            if archs != 'none':
-                arch_str = ",".join(archs)
-                options.append(
-                    '-DHIP_HIPCC_FLAGS=--amdgpu-target={0}'.format(arch_str)
-                )
-        else:
-            options.append('-DENABLE_HIP=OFF')
-
-        options.append(self.define_from_variant('ENABLE_TESTS', 'tests'))
-
-        return options
\ No newline at end of file
diff --git a/.gitlab/spack_packages/netlib-lapack/ibm-xl-3.9.1.patch b/.gitlab/spack_packages/netlib-lapack/ibm-xl-3.9.1.patch
new file mode 100644
index 0000000000..c9e1707857
--- /dev/null
+++ b/.gitlab/spack_packages/netlib-lapack/ibm-xl-3.9.1.patch
@@ -0,0 +1,108 @@
+Fixes for IBM XL and Cray CCE builds:
+
+* Correct path to the fallback configuration used to handle mangling for
+  C++/Fortran compatibility (CCE, XL)
+
+* Change logic for detecting recursive fortran flags to (a) Include XL
+(qrecur), and (b) Be explicit, since not every compiler will correctly reject
+an incorrect option (ALL)
+
+NOTE: This patch has been accepted upstream
+(see https://github.com/Reference-LAPACK/lapack/pull/621)
+
+##############################################################################
+
+diff -Naur a/CBLAS/CMakeLists.txt b/CBLAS/CMakeLists.txt
+--- a/CBLAS/CMakeLists.txt	2021-03-25 12:25:15.000000000 -0600
++++ b/CBLAS/CMakeLists.txt	2021-09-01 16:27:23.561355382 -0600
+@@ -11,9 +11,7 @@
+                          MACRO_NAMESPACE "F77_"
+                          SYMBOL_NAMESPACE "F77_")
+ if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND)
+-  message(WARNING "Reverting to pre-defined include/lapacke_mangling.h")
+-    configure_file(include/lapacke_mangling_with_flags.h.in
+-                  ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h)
++  message(WARNING "Reverting to pre-defined include/cblas_mangling.h")
+     configure_file(include/cblas_mangling_with_flags.h.in
+                  ${LAPACK_BINARY_DIR}/include/cblas_mangling.h)
+ endif()
+diff -Naur a/CMakeLists.txt b/CMakeLists.txt
+--- a/CMakeLists.txt	2021-03-25 12:25:15.000000000 -0600
++++ b/CMakeLists.txt	2021-09-02 09:49:18.070436958 -0600
+@@ -94,16 +94,22 @@
+ 
+ # Check if recursive flag exists
+ include(CheckFortranCompilerFlag)
+-check_fortran_compiler_flag("-recursive" _recursiveFlag)
+-check_fortran_compiler_flag("-frecursive" _frecursiveFlag)
+-check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag)
++if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang)
++  check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag)
++elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
++  check_fortran_compiler_flag("-frecursive" _frecursiveFlag)
++elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
++  check_fortran_compiler_flag("-recursive" _recursiveFlag)
++elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
++  check_fortran_compiler_flag("-qrecur" _qrecurFlag)
++endif()
+ 
+ # Add recursive flag
+-if(_recursiveFlag)
+-  string(REGEX MATCH "-recursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
++if(_MrecursiveFlag)
++  string(REGEX MATCH "-Mrecursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
+   if(NOT output_test)
+-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive"
+-      CACHE STRING "Recursive flag must be set" FORCE)
++    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive"
++    CACHE STRING "Recursive flag must be set" FORCE)
+   endif()
+ elseif(_frecursiveFlag)
+   string(REGEX MATCH "-frecursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
+@@ -111,11 +117,17 @@
+     set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive"
+     CACHE STRING "Recursive flag must be set" FORCE)
+   endif()
+-elseif(_MrecursiveFlag)
+-  string(REGEX MATCH "-Mrecursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
++elseif(_recursiveFlag)
++  string(REGEX MATCH "-recursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
+   if(NOT output_test)
+-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive"
+-    CACHE STRING "Recursive flag must be set" FORCE)
++    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive"
++      CACHE STRING "Recursive flag must be set" FORCE)
++  endif()
++elseif(_qrecurFlag)
++  string(REGEX MATCH "-qrecur" output_test <string> "${CMAKE_Fortran_FLAGS}")
++  if(NOT output_test)
++    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur"
++      CACHE STRING "Recursive flag must be set" FORCE)
+   endif()
+ endif()
+ 
+@@ -124,7 +136,7 @@
+     set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
+   endif()
+   if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
+-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none")
++    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict")
+   endif()
+ # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
+ # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
+diff -Naur a/INSTALL/make.inc.XLF b/INSTALL/make.inc.XLF
+--- a/INSTALL/make.inc.XLF	2021-03-25 12:25:15.000000000 -0600
++++ b/INSTALL/make.inc.XLF	2021-09-02 09:50:02.664646455 -0600
+@@ -14,10 +14,10 @@
+ #  the compiler options desired when NO OPTIMIZATION is selected.
+ #
+ FC = xlf
+-FFLAGS = -O3 -qfixed -qnosave
++FFLAGS = -O3 -qfixed -qnosave -qrecur
+ # For -O2, add -qstrict=none
+ FFLAGS_DRV = $(FFLAGS)
+-FFLAGS_NOOPT = -O0 -qfixed -qnosave
++FFLAGS_NOOPT = -O0 -qfixed -qnosave -qrecur
+ 
+ #  Define LDFLAGS to the desired linker options for your machine.
+ #
diff --git a/.gitlab/spack_packages/netlib-lapack/ibm-xl.patch b/.gitlab/spack_packages/netlib-lapack/ibm-xl.patch
new file mode 100644
index 0000000000..52b5f19719
--- /dev/null
+++ b/.gitlab/spack_packages/netlib-lapack/ibm-xl.patch
@@ -0,0 +1,53 @@
+Fixes for IBM XL and Cray CCE builds:
+
+* Avoid optimizations that would alter program semantics by changing the
+  qstrict activation threshold from O3 to O2 (XL)
+
+* Don't assume Fortran code is all in fixed source form; disable qfixed (XL)
+
+* Correct path to the fallback configuration used to handle mangling for
+  C++/Fortran compatibility (CCE, XL)
+##############################################################################
+
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -62,7 +62,7 @@
+     set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
+   endif()
+   if("${CMAKE_Fortran_COMPILER}" MATCHES "xlf")
+-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none")
++    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict")
+   endif()
+ # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
+ # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
+
+--- a/CMAKE/CheckLAPACKCompilerFlags.cmake
++++ b/CMAKE/CheckLAPACKCompilerFlags.cmake
+@@ -43,12 +43,6 @@
+   if( "${CMAKE_Fortran_FLAGS}" MATCHES "-qflttrap=[a-zA-Z:]:enable" )
+     set( FPE_EXIT TRUE )
+   endif()
+-
+-  if( NOT ("${CMAKE_Fortran_FLAGS}" MATCHES "-qfixed") )
+-    message( STATUS "Enabling fixed format F90/F95 with -qfixed" )
+-    set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qfixed"
+-         CACHE STRING "Flags for Fortran compiler." FORCE )
+-  endif()
+ 
+ # HP Fortran
+ elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "HP" )
+
+--- a/CBLAS/CMakeLists.txt
++++ b/CBLAS/CMakeLists.txt
+@@ -12,8 +12,8 @@
+                          SYMBOL_NAMESPACE "F77_")
+ if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND)
+   message(WARNING "Reverting to pre-defined include/lapacke_mangling.h")
+-  configure_file(include/lapacke_mangling_with_flags.h.in
+-                 ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h)
++  configure_file(include/cblas_mangling_with_flags.h.in
++                 ${LAPACK_BINARY_DIR}/include/cblas_mangling.h)
+ endif()
+ 
+ include_directories(include ${LAPACK_BINARY_DIR}/include)
+
diff --git a/.gitlab/spack_packages/netlib-lapack/package.py b/.gitlab/spack_packages/netlib-lapack/package.py
new file mode 100644
index 0000000000..cb1832d061
--- /dev/null
+++ b/.gitlab/spack_packages/netlib-lapack/package.py
@@ -0,0 +1,223 @@
+# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+import spack.build_systems.cmake
+from spack.package import *
+
+
+class NetlibLapack(CMakePackage):
+    """LAPACK version 3.X is a comprehensive FORTRAN library that does
+    linear algebra operations including matrix inversions, least squared
+    solutions to linear sets of equations, eigenvector analysis, singular
+    value decomposition, etc. It is a very comprehensive and reputable
+    package that has found extensive use in the scientific community.
+
+    """
+
+    homepage = "https://www.netlib.org/lapack/"
+    url = "https://www.netlib.org/lapack/lapack-3.5.0.tgz"
+    tags = ["windows"]
+
+    version(
+        "3.10.1",
+        sha256="cd005cd021f144d7d5f7f33c943942db9f03a28d110d6a3b80d718a295f7f714",
+        url="https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v3.10.1.tar.gz",
+    )
+    version(
+        "3.10.0",
+        sha256="328c1bea493a32cac5257d84157dc686cc3ab0b004e2bea22044e0a59f6f8a19",
+        url="https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v3.10.0.tar.gz",
+    )
+    version(
+        "3.9.1",
+        sha256="d0085d2caf997ff39299c05d4bacb6f3d27001d25a4cc613d48c1f352b73e7e0",
+        url="https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v3.9.1.tar.gz",
+    )
+    version(
+        "3.9.0",
+        sha256="106087f1bb5f46afdfba7f569d0cbe23dacb9a07cd24733765a0e89dbe1ad573",
+        url="https://github.com/Reference-LAPACK/lapack/archive/v3.9.0.tar.gz",
+    )
+    version(
+        "3.8.0",
+        sha256="deb22cc4a6120bff72621155a9917f485f96ef8319ac074a7afbc68aab88bcf6",
+        url="https://www.netlib.org/lapack/lapack-3.8.0.tar.gz",
+    )
+    version("3.7.1", sha256="f6c53fd9f56932f3ddb3d5e24c1c07e4cd9b3b08e7f89de9c867125eecc9a1c8")
+    version("3.7.0", sha256="ed967e4307e986474ab02eb810eed1d1adc73f5e1e3bc78fb009f6fe766db3be")
+    version("3.6.1", sha256="888a50d787a9d828074db581c80b2d22bdb91435a673b1bf6cd6eb51aa50d1de")
+    version("3.6.0", sha256="a9a0082c918fe14e377bbd570057616768dca76cbdc713457d8199aaa233ffc3")
+    version("3.5.0", sha256="9ad8f0d3f3fb5521db49f2dd716463b8fb2b6bc9dc386a9956b8c6144f726352")
+    version("3.4.2", sha256="60a65daaf16ec315034675942618a2230521ea7adf85eea788ee54841072faf0")
+    version("3.4.1", sha256="93b910f94f6091a2e71b59809c4db4a14655db527cfc5821ade2e8c8ab75380f")
+    version("3.4.0", sha256="a7139ef97004d0e3c4c30f1c52d508fd7ae84b5fbaf0dd8e792c167dc306c3e9")
+    version("3.3.1", sha256="56821ab51c29369a34e5085728f92c549a9aa926f26acf7eeac87b61eed329e4")
+
+    # netlib-lapack is the reference implementation of LAPACK
+    for ver in [
+        "3.10.1",
+        "3.10.0",
+        "3.9.1",
+        "3.9.0",
+        "3.8.0",
+        "3.7.1",
+        "3.7.0",
+        "3.6.1",
+        "3.6.0",
+        "3.5.0",
+        "3.4.2",
+        "3.4.1",
+        "3.4.0",
+        "3.3.1",
+    ]:
+        provides("lapack@" + ver, when="@" + ver)
+
+    variant("shared", default=True, description="Build shared library version")
+    variant("external-blas", default=False, description="Build lapack with an external blas")
+
+    variant("lapacke", default=True, description="Activates the build of the LAPACKE C interface")
+    variant("xblas", default=False, description="Builds extended precision routines using XBLAS")
+
+    # Fixes for IBM XL and Cray CCE builds:
+    #   Avoid optimizations that alter program semantics
+    #   Don't assume fixed source form for Fortran
+    #   Correct path to mangling config
+    patch("ibm-xl.patch", when="@3.7:3.8 %xl")
+    patch("ibm-xl.patch", when="@3.7:3.8 %xl_r")
+    patch("ibm-xl.patch", when="@3.7:3.8 %cce@9:")
+
+    # https://github.com/Reference-LAPACK/lapack/pull/621
+    # Fixes for IBM XL and Cray CCE builds:
+    #   Correct path to mangling config
+    #   Fix logic for detecting recursive Fortran flags
+    patch("ibm-xl-3.9.1.patch", when="@3.9.1 %xl")
+    patch("ibm-xl-3.9.1.patch", when="@3.9.1 %xl_r")
+    patch("ibm-xl-3.9.1.patch", when="@3.9.1 %cce@13:")
+
+    # https://github.com/Reference-LAPACK/lapack/issues/228
+    patch("undefined_declarations.patch", when="@3.8.0:3.8")
+
+    # https://github.com/Reference-LAPACK/lapack/pull/268
+    patch("testing.patch", when="@3.7.0:3.8")
+
+    # virtual dependency
+    provides("blas", when="~external-blas")
+    provides("lapack")
+
+    depends_on("blas", when="+external-blas")
+    depends_on("netlib-xblas+fortran+plain_blas", when="+xblas")
+    depends_on("python@2.7:", type="test")
+
+    # We need to run every phase twice in order to get static and shared
+    # versions of the libraries. When ~shared, we run the default
+    # implementations of the CMakePackage's phases and get only one building
+    # directory 'spack-build-static' with -DBUILD_SHARED_LIBS:BOOL=OFF (see
+    # implementations of self.build_directory and self.cmake_args() below).
+    # When +shared, we run the overridden methods for the phases, each
+    # running the default implementation twice with different values for
+    # self._building_shared. As a result, we get two building directories:
+    # 'spack-build-static' with -DBUILD_SHARED_LIBS:BOOL=OFF and
+    # 'spack-build-shared' with -DBUILD_SHARED_LIBS:BOOL=ON.
+    _building_shared = False
+
+    def patch(self):
+        # Fix cblas CMakeLists.txt -- has wrong case for subdirectory name.
+        if self.spec.satisfies("@3.6.0:"):
+            filter_file(
+                "${CMAKE_CURRENT_SOURCE_DIR}/CMAKE/",
+                "${CMAKE_CURRENT_SOURCE_DIR}/cmake/",
+                "CBLAS/CMakeLists.txt",
+                string=True,
+            )
+
+        # Remove duplicate header file that gets generated during CMake shared
+        # builds: https://github.com/Reference-LAPACK/lapack/issues/583
+        if self.spec.satisfies("platform=windows @0:3.9.1"):
+            force_remove("LAPACKE/include/lapacke_mangling.h")
+
+    @property
+    def blas_libs(self):
+        shared = True if "+shared" in self.spec else False
+        query_parameters = self.spec.last_query.extra_parameters
+        query2libraries = {
+            tuple(): ["libblas"],
+            ("c", "fortran"): ["libcblas", "libblas"],
+            ("c",): ["libcblas"],
+            ("fortran",): ["libblas"],
+        }
+        key = tuple(sorted(query_parameters))
+        libraries = query2libraries[key]
+        return find_libraries(libraries, root=self.prefix, shared=shared, recursive=True)
+
+    @property
+    def lapack_libs(self):
+        shared = True if "+shared" in self.spec else False
+        query_parameters = self.spec.last_query.extra_parameters
+        query2libraries = {
+            tuple(): ["liblapack"],
+            ("c", "fortran"): ["liblapacke", "liblapack"],
+            ("c",): ["liblapacke"],
+            ("fortran",): ["liblapack"],
+        }
+        key = tuple(sorted(query_parameters))
+        libraries = query2libraries[key]
+        return find_libraries(libraries, root=self.prefix, shared=shared, recursive=True)
+
+    @property
+    def headers(self):
+        include_dir = self.spec.prefix.include
+        cblas_h = join_path(include_dir, "cblas.h")
+        lapacke_h = join_path(include_dir, "lapacke.h")
+        return HeaderList([cblas_h, lapacke_h])
+
+
+class CMakeBuilder(spack.build_systems.cmake.CMakeBuilder):
+    def cmake_args(self):
+        args = [
+            self.define_from_variant("BUILD_SHARED_LIBS", "shared"),
+            self.define_from_variant("LAPACKE", "lapacke"),
+            self.define_from_variant("LAPACKE_WITH_TMG", "lapacke"),
+            self.define("CBLAS", self.spec.satisfies("@3.6.0:")),
+        ]
+
+        if self.spec.satisfies("%intel"):
+            # Intel compiler finds serious syntax issues when trying to
+            # build CBLAS and LapackE
+            args.extend([self.define("CBLAS", False), self.define("LAPACKE", False)])
+
+        if self.spec.satisfies("%xl") or self.spec.satisfies("%xl_r"):
+            # use F77 compiler if IBM XL
+            args.extend(
+                [
+                    self.define("CMAKE_Fortran_COMPILER", self.pkg.compiler.f77),
+                    self.define(
+                        "CMAKE_Fortran_FLAGS",
+                        " ".join(self.spec.compiler_flags["fflags"]) + " -O3 -qnohot",
+                    ),
+                ]
+            )
+
+        # deprecated routines are commonly needed by, for example, suitesparse
+        # Note that OpenBLAS spack is built with deprecated routines
+        args.append(self.define("BUILD_DEPRECATED", True))
+
+        if self.spec.satisfies("+external-blas"):
+            args.extend(
+                [
+                    self.define("USE_OPTIMIZED_BLAS", True),
+                    self.define("BLAS_LIBRARIES:PATH", self.spec["blas"].libs.joined(";")),
+                ]
+            )
+
+        if self.spec.satisfies("+xblas"):
+            args.extend(
+                [
+                    self.define("XBLAS_INCLUDE_DIR", self.spec["netlib-xblas"].prefix.include),
+                    self.define("XBLAS_LIBRARY", self.spec["netlib-xblas"].libs.joined(";")),
+                ]
+            )
+
+        args.append(self.define("BUILD_TESTING", self.pkg.run_tests))
+
+        return args
diff --git a/.gitlab/spack_packages/netlib-lapack/testing.patch b/.gitlab/spack_packages/netlib-lapack/testing.patch
new file mode 100644
index 0000000000..fce18548c4
--- /dev/null
+++ b/.gitlab/spack_packages/netlib-lapack/testing.patch
@@ -0,0 +1,13 @@
+diff --git a/TESTING/LIN/alahd.f b/TESTING/LIN/alahd.f
+index 8f4cd58d..6a4946e0 100644
+--- a/TESTING/LIN/alahd.f
++++ b/TESTING/LIN/alahd.f
+@@ -1036,7 +1036,7 @@
+  9929 FORMAT( ' Test ratios (1-3: ', A1, 'TZRZF):' )
+  9920 FORMAT( 3X, ' 7-10: same as 3-6', 3X, ' 11-14: same as 3-6' )
+  9921 FORMAT( ' Test ratios:', / '    (1-2: ', A1, 'GELS, 3-6: ', A1,
+-     $      'GELSY, 7-10: ', A1, 'GELSS, 11-14: ', A1, 'GELSD, 15-16: '
++     $      'GELSY, 7-10: ', A1, 'GELSS, 11-14: ', A1, 'GELSD, 15-16: ',
+      $        A1, 'GETSLS)')
+  9928 FORMAT( 7X, 'where ALPHA = ( 1 + SQRT( 17 ) ) / 8' )
+  9927 FORMAT( 3X, I2, ': ABS( Largest element in L )', / 12X,
diff --git a/.gitlab/spack_packages/netlib-lapack/undefined_declarations.patch b/.gitlab/spack_packages/netlib-lapack/undefined_declarations.patch
new file mode 100644
index 0000000000..9dac2562f7
--- /dev/null
+++ b/.gitlab/spack_packages/netlib-lapack/undefined_declarations.patch
@@ -0,0 +1,26 @@
+diff --git a/SRC/dsytrf_aa_2stage.f b/SRC/dsytrf_aa_2stage.f
+index 2991305..f5f06cc 100644
+--- a/SRC/dsytrf_aa_2stage.f
++++ b/SRC/dsytrf_aa_2stage.f
+@@ -191,7 +191,7 @@
+       EXTERNAL           LSAME, ILAENV
+ *     ..
+ *     .. External Subroutines ..
+-      EXTERNAL           XERBLA, DCOPY, DLACGV, DLACPY,
++      EXTERNAL           XERBLA, DCOPY, DLACPY,
+      $                   DLASET, DGBTRF, DGEMM,  DGETRF, 
+      $                   DSYGST, DSWAP, DTRSM 
+ *     ..
+diff --git a/SRC/ssytrf_aa_2stage.f b/SRC/ssytrf_aa_2stage.f
+index be6809d..a929749 100644
+--- a/SRC/ssytrf_aa_2stage.f
++++ b/SRC/ssytrf_aa_2stage.f
+@@ -191,7 +191,7 @@
+       EXTERNAL           LSAME, ILAENV
+ *     ..
+ *     .. External Subroutines ..
+-      EXTERNAL           XERBLA, SCOPY, SLACGV, SLACPY,
++      EXTERNAL           XERBLA, SCOPY, SLACPY,
+      $                   SLASET, SGBTRF, SGEMM,  SGETRF, 
+      $                   SSYGST, SSWAP, STRSM 
+ *     ..
diff --git a/.gitlab/spack_packages/raja/package.py b/.gitlab/spack_packages/raja/package.py
deleted file mode 100644
index db49267ed1..0000000000
--- a/.gitlab/spack_packages/raja/package.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
-# Spack Project Developers. See the top-level COPYRIGHT file for details.
-#
-# SPDX-License-Identifier: (Apache-2.0 OR MIT)
-
-import socket
-
-from spack import *
-
-
-class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
-    """RAJA Parallel Framework."""
-
-    homepage = "https://software.llnl.gov/RAJA/"
-    git      = "https://github.com/LLNL/RAJA.git"
-    tags     = ['radiuss', 'e4s']
-
-    maintainers = ['davidbeckingsale']
-
-    version('develop', branch='develop', submodules=False)
-    version('main',  branch='main',  submodules=False)
-    version('2022.03.1', tag='v2022.03.0', submodules=False)
-    version('2022.03.0', tag='v2022.03.0', submodules=False)
-    version('0.14.0', tag='v0.14.0', submodules='True')
-    version('0.13.0', tag='v0.13.0', submodules='True')
-    version('0.12.1', tag='v0.12.1', submodules="True")
-    version('0.12.0', tag='v0.12.0', submodules="True")
-    version('0.11.0', tag='v0.11.0', submodules="True")
-    version('0.10.1', tag='v0.10.1', submodules="True")
-    version('0.10.0', tag='v0.10.0', submodules="True")
-    version('0.9.0', tag='v0.9.0', submodules="True")
-    version('0.8.0', tag='v0.8.0', submodules="True")
-    version('0.7.0', tag='v0.7.0', submodules="True")
-    version('0.6.0', tag='v0.6.0', submodules="True")
-    version('0.5.3', tag='v0.5.3', submodules="True")
-    version('0.5.2', tag='v0.5.2', submodules="True")
-    version('0.5.1', tag='v0.5.1', submodules="True")
-    version('0.5.0', tag='v0.5.0', submodules="True")
-    version('0.4.1', tag='v0.4.1', submodules="True")
-    version('0.4.0', tag='v0.4.0', submodules="True")
-
-    # export targets when building pre-2.4.0 release with BLT 0.4.0+
-    patch('https://github.com/LLNL/RAJA/commit/eca1124ee4af380d6613adc6012c307d1fd4176b.patch?full_index=1',
-          sha256='12bb78c00b6683ad3e7fd4e3f87f9776bae074b722431b79696bc862816735ef',
-          when='@:0.13.0 ^blt@0.4:')
-
-    variant('openmp', default=True, description='Build OpenMP backend')
-    variant('shared', default=True, description='Build Shared Libs')
-    variant('examples', default=True, description='Build examples.')
-    variant('exercises', default=True, description='Build exercises.')
-    # TODO: figure out gtest dependency and then set this default True
-    # and remove the +tests conflict below.
-    variant('tests', default=False, description='Build tests')
-
-    depends_on('blt')
-    depends_on('blt@0.5.0:', type='build', when='@0.14.1:')
-    depends_on('blt@0.4.1', type='build', when='@0.14.0')
-    depends_on('blt@0.4.0:', type='build', when='@0.13.0')
-    depends_on('blt@0.3.6:', type='build', when='@:0.12.0')
-
-    depends_on('camp@0.2.2', when='@0.14.0')
-    depends_on('camp@0.1.0', when='@0.10.0:0.13.0')
-    depends_on('camp@2022.03.0:', when='@2022.03.0:')
-
-    depends_on('cmake@:3.20', when='+rocm', type='build')
-    depends_on('cmake@3.14:', when='@2022.03.0:')
-
-    with when('+rocm @0.12.0:'):
-        depends_on('camp+rocm')
-        for arch in ROCmPackage.amdgpu_targets:
-            depends_on('camp+rocm amdgpu_target={0}'.format(arch),
-                       when='amdgpu_target={0}'.format(arch))
-        conflicts('+openmp')
-
-    with when('+cuda @0.12.0:'):
-        depends_on('camp+cuda')
-        for sm_ in CudaPackage.cuda_arch_values:
-            depends_on('camp +cuda cuda_arch={0}'.format(sm_),
-                       when='cuda_arch={0}'.format(sm_))
-
-    def _get_sys_type(self, spec):
-        sys_type = spec.architecture
-        if "SYS_TYPE" in env:
-            sys_type = env["SYS_TYPE"]
-        return sys_type
-
-    @property
-    def cache_name(self):
-        hostname = socket.gethostname()
-        if "SYS_TYPE" in env:
-            hostname = hostname.rstrip('1234567890')
-        return "{0}-{1}-{2}@{3}.cmake".format(
-            hostname,
-            self._get_sys_type(self.spec),
-            self.spec.compiler.name,
-            self.spec.compiler.version
-        )
-
-    def initconfig_hardware_entries(self):
-        spec = self.spec
-        entries = super(Raja, self).initconfig_hardware_entries()
-
-        entries.append(cmake_cache_option("ENABLE_OPENMP", '+openmp' in spec))
-
-        if '+cuda' in spec:
-            entries.append(cmake_cache_option("ENABLE_CUDA", True))
-
-            if not spec.satisfies('cuda_arch=none'):
-                cuda_arch = spec.variants['cuda_arch'].value
-                entries.append(cmake_cache_string(
-                    "CUDA_ARCH", 'sm_{0}'.format(cuda_arch[0])))
-                entries.append(cmake_cache_string(
-                    "CMAKE_CUDA_ARCHITECTURES", '{0}'.format(cuda_arch[0])))
-        else:
-            entries.append(cmake_cache_option("ENABLE_CUDA", False))
-
-        if '+rocm' in spec:
-            entries.append(cmake_cache_option("ENABLE_HIP", True))
-            entries.append(cmake_cache_path(
-                "HIP_ROOT_DIR", '{0}'.format(spec['hip'].prefix)))
-            archs = self.spec.variants['amdgpu_target'].value
-            if archs != 'none':
-                arch_str = ",".join(archs)
-                entries.append(cmake_cache_string(
-                    "HIP_HIPCC_FLAGS", '--amdgpu-target={0}'.format(arch_str)))
-        else:
-            entries.append(cmake_cache_option("ENABLE_HIP", False))
-
-        return entries
-
-    def initconfig_package_entries(self):
-        spec = self.spec
-        entries = []
-
-        option_prefix = "RAJA_" if spec.satisfies("@2022.03.0:") else ""
-
-        entries.append(cmake_cache_path("BLT_SOURCE_DIR", spec['blt'].prefix))
-        if 'camp' in self.spec:
-            entries.append(cmake_cache_path("camp_DIR", spec['camp'].prefix))
-        entries.append(cmake_cache_option("BUILD_SHARED_LIBS", '+shared' in spec))
-        entries.append(cmake_cache_option(
-            "{}ENABLE_EXAMPLES".format(option_prefix), '+examples' in spec))
-        if spec.satisfies('@0.14.0:'):
-            entries.append(cmake_cache_option(
-                "{}ENABLE_EXERCISES".format(option_prefix), '+exercises' in spec))
-        else:
-            entries.append(cmake_cache_option("ENABLE_EXERCISES",
-                                              '+exercises' in spec))
-
-        # Work around spack adding -march=ppc64le to SPACK_TARGET_ARGS which
-        # is used by the spack compiler wrapper.  This can go away when BLT
-        # removes -Werror from GTest flags
-        if self.spec.satisfies('%clang target=ppc64le:') or not self.run_tests:
-            entries.append(cmake_cache_option("ENABLE_TESTS", False))
-        else:
-            entries.append(cmake_cache_option("ENABLE_TESTS", True))
-
-        return entries
-
-    def cmake_args(self):
-        options = []
-        return options
-
-    @property
-    def build_relpath(self):
-        """Relative path to the cmake build subdirectory."""
-        return join_path('..', self.build_dirname)
-
-    @run_after('install')
-    def setup_build_tests(self):
-        """Copy the build test files after the package is installed to a
-        relative install test subdirectory for use during `spack test run`."""
-        # Now copy the relative files
-        self.cache_extra_test_sources(self.build_relpath)
-
-        # Ensure the path exists since relying on a relative path at the
-        # same level as the normal stage source path.
-        mkdirp(self.install_test_root)
-
-    @property
-    def _extra_tests_path(self):
-        # TODO: The tests should be converted to re-build and run examples
-        # TODO: using the installed libraries.
-        return join_path(self.install_test_root, self.build_relpath, 'bin')
-
-    def _test_examples(self):
-        """Perform very basic checks on a subset of copied examples."""
-        checks = [
-            ('ex5_line-of-sight_solution',
-             [r'RAJA sequential', r'RAJA OpenMP', r'result -- PASS']),
-            ('ex6_stencil-offset-layout_solution',
-             [r'RAJA Views \(permuted\)', r'result -- PASS']),
-            ('ex8_tiled-matrix-transpose_solution',
-             [r'parallel top inner loop',
-              r'collapsed inner loops', r'result -- PASS']),
-            ('kernel-dynamic-tile', [r'Running index', r'(24,24)']),
-            ('plugin-example',
-             [r'Launching host kernel for the 10 time']),
-            ('tut_batched-matrix-multiply', [r'result -- PASS']),
-            ('wave-eqn', [r'Max Error = 2', r'Evolved solution to time'])
-        ]
-        for exe, expected in checks:
-            reason = 'test: checking output of {0} for {1}' \
-                .format(exe, expected)
-            self.run_test(exe, [], expected, installed=False,
-                          purpose=reason, skip_missing=True,
-                          work_dir=self._extra_tests_path)
-
-    def test(self):
-        """Perform smoke tests."""
-        self._test_examples()
\ No newline at end of file
diff --git a/.gitlab/spack_packages/sundials/package.py b/.gitlab/spack_packages/sundials/package.py
index 4de41139de..ad7ee0cf3c 100644
--- a/.gitlab/spack_packages/sundials/package.py
+++ b/.gitlab/spack_packages/sundials/package.py
@@ -27,7 +27,10 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage):
     # Versions
     # ==========================================================================
     version("develop", branch="develop")
-    version("6.4.0", branch="develop")
+    version("6.5.1", sha256="4252303805171e4dbdd19a01e52c1dcfe0dafc599c3cfedb0a5c2ffb045a8a75")
+    version("6.5.0", sha256="4e0b998dff292a2617e179609b539b511eb80836f5faacf800e688a886288502")
+    version("6.4.1", sha256="7bf10a8d2920591af3fba2db92548e91ad60eb7241ab23350a9b1bc51e05e8d0")
+    version("6.4.0", sha256="0aff803a12c6d298d05b56839197dd09858631864017e255ed89e28b49b652f1")
     version("6.3.0", sha256="89a22bea820ff250aa7239f634ab07fa34efe1d2dcfde29cc8d3af11455ba2a7")
     version("6.2.0", sha256="195d5593772fc483f63f08794d79e4bab30c2ec58e6ce4b0fb6bcc0e0c48f31d")
     version("6.1.1", sha256="cfaf637b792c330396a25ef787eb59d58726c35918ebbc08e33466e45d50470c")
@@ -123,9 +126,17 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage):
         when="@6.0.0: +profiling",
         description="Enable Caliper instrumentation/profiling",
     )
+    variant("ginkgo", default=False, when="@6.4.0:", description="Enable Ginkgo interfaces")
     variant("hypre", default=False, when="@2.7.0:", description="Enable Hypre MPI parallel vector")
-    variant("lapack", default=False, description="Enable LAPACK direct solvers")
+    variant("kokkos", default=False, when="@6.4.0:", description="Enable Kokkos vector")
+    variant(
+        "kokkos-kernels",
+        default=False,
+        when="@6.4.0:",
+        description="Enable KokkosKernels based matrix and linear solver",
+    )
     variant("klu", default=False, description="Enable KLU sparse, direct solver")
+    variant("lapack", default=False, description="Enable LAPACK direct solvers")
     variant("petsc", default=False, when="@2.7.0:", description="Enable PETSc interfaces")
     variant("magma", default=False, when="@5.7.0:", description="Enable MAGMA interface")
     variant("superlu-mt", default=False, description="Enable SuperLU_MT sparse, direct solver")
@@ -169,6 +180,9 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage):
         "profiling", default=False, when="@6.0.0:", description="Build with profiling capabilities"
     )
 
+    # Scheduler
+    variant("scheduler", default="slurm", description="Specify which scheduler the system runs on.", values=("flux", "lsf", "slurm"))
+
     # ==========================================================================
     # Dependencies
     # ==========================================================================
@@ -190,6 +204,23 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage):
 
     # External libraries
     depends_on("caliper", when="+caliper")
+    depends_on("ginkgo@1.5.0:", when="+ginkgo")
+    depends_on("kokkos", when="+kokkos")
+    depends_on("kokkos-kernels", when="+kokkos-kernels")
+    for cuda_arch in CudaPackage.cuda_arch_values:
+        depends_on(
+            "kokkos+cuda+cuda_lambda+cuda_constexpr cuda_arch=%s" % cuda_arch,
+            when="+kokkos +cuda cuda_arch=%s" % cuda_arch,
+        )
+        depends_on(
+            "kokkos-kernels+cuda cuda_arch=%s" % cuda_arch,
+            when="+kokkos-kernels +cuda cuda_arch=%s" % cuda_arch,
+        )
+    for rocm_arch in ROCmPackage.amdgpu_targets:
+        depends_on(
+            "kokkos+rocm amdgpu_target=%s" % rocm_arch,
+            when="+kokkos +rocm amdgpu_target=%s" % rocm_arch,
+        )
     depends_on("lapack", when="+lapack")
     depends_on("hypre+mpi@2.22.1:", when="@5.7.1: +hypre")
     depends_on("hypre+mpi@:2.22.0", when="@:5.7.0 +hypre")
@@ -205,13 +236,13 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage):
     # Require that external libraries built with the same precision
     depends_on("petsc~double~complex", when="+petsc precision=single")
     depends_on("petsc+double~complex", when="+petsc precision=double")
-    
+
     # Require that external libraries built with the same index type
     with when('+int64'):
         depends_on("hypre+mpi+int64", when="+hypre +int64")
         depends_on("petsc+int64", when="+petsc +int64")
         depends_on("superlu-dist+int64", when="+superlu-dist +int64")
-    
+
     with when('~int64'):
         depends_on("hypre+mpi~int64", when="+hypre ~int64")
         depends_on("petsc~int64", when="+petsc ~int64")
@@ -640,6 +671,13 @@ def initconfig_mpi_entries(self):
                     cmake_cache_path("MPI_MPIF90", spec["mpi"].mpifc)
                 ]
             )
+            if "scheduler=flux" in spec:
+                entries.append(cmake_cache_string("SUNDIALS_TEST_MPIRUN_COMMAND", "flux run"))
+            if "scheduler=slurm" in spec:
+                entries.append(cmake_cache_string("SUNDIALS_TEST_MPIRUN_COMMAND", "srun"))
+            if "scheduler=lsf" in spec:
+                entries.append(cmake_cache_string("SUNDIALS_TEST_MPIRUN_COMMAND", "jsrun"))
+                
 
         return entries
 
@@ -666,10 +704,9 @@ def initconfig_hardware_entries(self):
                     cmake_cache_path("HIP_PATH", spec["hip"].prefix),
                     cmake_cache_path("HIP_CLANG_INCLUDE_PATH", spec["llvm-amdgpu"].prefix.include),
                     cmake_cache_path("ROCM_PATH", spec["llvm-amdgpu"].prefix),
-                    cmake_cache_string("AMDGPU_TARGETS", spec.variants["amdgpu_target"].value)
+                    cmake_cache_string("AMDGPU_TARGETS", ";".join(spec.variants["amdgpu_target"].value))
                 ]
             )
-
         return entries
 
     def initconfig_package_entries(self):
@@ -735,6 +772,25 @@ def initconfig_package_entries(self):
         if "+caliper" in spec:
             entries.append(cmake_cache_path("CALIPER_DIR", spec["caliper"].prefix))
 
+        # Building with Ginkgo
+        if "+ginkgo" in spec:
+            gko_backends = ["REF"]
+            if "+openmp" in spec["ginkgo"] and "+openmp" in spec:
+                gko_backends.append("OMP")
+            if "+cuda" in spec["ginkgo"] and "+cuda" in spec:
+                gko_backends.append("CUDA")
+            if "+rocm" in spec["ginkgo"] and "+rocm" in spec:
+                gko_backends.append("HIP")
+            if "+oneapi" in spec["ginkgo"] and "+sycl" in spec:
+                gko_backends.append("DPCPP")
+            entries.extend(
+                [
+                    self.cache_option_from_variant("ENABLE_GINKGO", "ginkgo"),
+                    cmake_cache_path("Ginkgo_DIR", spec["ginkgo"].prefix),
+                    cmake_cache_string("SUNDIALS_GINKGO_BACKENDS", ";".join(gko_backends)),
+                ]
+            )
+
         # Building with Hypre
         if "+hypre" in spec:
             entries.extend(
@@ -747,6 +803,12 @@ def initconfig_package_entries(self):
                 hypre_libs = spec["blas"].libs + spec["lapack"].libs
                 entries.extend([cmake_cache_string("HYPRE_LIBRARIES", hypre_libs.joined(";"))])
 
+        # Building with Kokkos and KokkosKernels
+        if "+kokkos" in spec:
+            entries.extend([self.cache_option_from_variant("Kokkos_DIR", spec["kokkos"].prefix)])
+        if "+kokkos-kernels" in spec:
+            entries.extend([self.cache_option_from_variant("KokkosKernels_DIR", spec["kokkos-kernels"].prefix)])
+
         # Building with KLU
         if "+klu" in spec:
             entries.extend(
@@ -788,23 +850,30 @@ def initconfig_package_entries(self):
             entries.append(cmake_cache_path("RAJA_DIR", spec["raja"].prefix))
             if "camp" in spec:
                 entries.append(cmake_cache_path("camp_DIR", spec["camp"].prefix.lib.cmake + '/camp'))
+            if "+rocm" in spec:
+                entries.append(cmake_cache_string("SUNDIALS_RAJA_BACKENDS", "HIP"))
 
         # Building with SuperLU_DIST
         if "+superlu-dist" in spec:
-            if spec.satisfies("@6.4.0:"):
+            #if spec.satisfies("@6.4.0:"):
+            if False:
                 entries.extend(
                     [
                         cmake_cache_path("SUPERLUDIST_DIR", spec["superlu-dist"].prefix),
-                        cmake_cache_string("SUPERLUDIST_OpenMP", "^superlu-dist+openmp" in spec), 
+                        cmake_cache_string("SUPERLUDIST_OpenMP", "^superlu-dist+openmp" in spec),
                     ]
                 )
             else:
+                superludist_libs = []
+                superludist_libs.extend(spec["parmetis"].libs)
+                superludist_libs.extend(spec["metis"].libs)
+                superludist_libs.extend(spec["superlu-dist"].libs)
                 entries.extend(
                     [
                         cmake_cache_path("SUPERLUDIST_INCLUDE_DIR", spec["superlu-dist"].prefix.include),
                         cmake_cache_path("SUPERLUDIST_LIBRARY_DIR", spec["superlu-dist"].prefix.lib),
-                        cmake_cache_string("SUPERLUDIST_LIBRARIES", spec["superlu-dist"].libs),
-                        cmake_cache_string("SUPERLUDIST_OpenMP", "^superlu-dist+openmp" in spec), 
+                        cmake_cache_string("SUPERLUDIST_LIBRARIES", ";".join(superludist_libs)),
+                        cmake_cache_string("SUPERLUDIST_OpenMP", "^superlu-dist+openmp" in spec),
                     ]
                 )
 
diff --git a/.gitlab/uberenv b/.gitlab/uberenv
index 4941c237ee..0d00dc8e19 160000
--- a/.gitlab/uberenv
+++ b/.gitlab/uberenv
@@ -1 +1 @@
-Subproject commit 4941c237eec514d6d68872243efb9f4af8843f4d
+Subproject commit 0d00dc8e19a889ba07ae433590b87533c4b5b3da
diff --git a/.uberenv_config.json b/.uberenv_config.json
index 7c65ba8c6f..67618cca2f 100644
--- a/.uberenv_config.json
+++ b/.uberenv_config.json
@@ -4,8 +4,8 @@
   "package_final_phase": "initconfig",
   "package_source_dir": "../..",
   "spack_url": "https://github.com/spack/spack",
-  "spack_commit": "13e6f87ef6527954b152eaea303841978e83b992",
+  "spack_commit": "5e0d2107348eed6cbe6deca43a30f5b06c5e40af",
   "spack_activate": {},
   "spack_configs_path": ".gitlab/radiuss-spack-configs",
   "spack_packages_path": ".gitlab/spack_packages"
-}
\ No newline at end of file
+}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44eb259cbf..28a06f737a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # SUNDIALS Changelog
 
+## Changes to SUNDIALS in release 6.6.0
+
+Added the second order IMEX method from Giraldo, Kelly, and Constantinescu 2013
+as the default second order IMEX method in ARKStep. The explicit table is given
+by `ARKODE_ARK2_ERK_3_1_2` and the implicit table by `ARKODE_ARK2_DIRK_3_1_2`.
+
+Updated the F2003 utility routines `SUNDIALSFileOpen` and `SUNDIALSFileClose`
+to support user specification of `stdout` and `stderr` strings for the output
+file names.
+
 ## Changes to SUNDIALS in release 6.5.1
 
 Added the functions `ARKStepClearStopTime`, `ERKStepClearStopTime`,
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 793ffbb0a4..e9fd4648c1 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -26,12 +26,9 @@ sundials_option(BENCHMARK_NVECTOR BOOL "NVector benchmarks are on" ON)
 # Add specific benchmarks
 #----------------------------------------
 
-if(ENABLE_MPI AND ENABLE_RAJA)
-  add_subdirectory(advection_reaction_3D)
-endif()
-
 if(ENABLE_MPI)
-  add_subdirectory(diffusion_2D)
+  add_subdirectory(diffusion_2D)
+  add_subdirectory(advection_reaction_3D)
 endif()
 
 # Add the nvector benchmarks
diff --git a/benchmarks/advection_reaction_3D/CMakeLists.txt b/benchmarks/advection_reaction_3D/CMakeLists.txt
index e51a95155a..7469a6a10a 100644
--- a/benchmarks/advection_reaction_3D/CMakeLists.txt
+++ b/benchmarks/advection_reaction_3D/CMakeLists.txt
@@ -1,5 +1,5 @@
 # ---------------------------------------------------------------
-# Programmer(s):  Cody J. Balos @ LLNL
+# Programmer(s): Daniel R. Reynolds @ SMU
 # ---------------------------------------------------------------
 # SUNDIALS Copyright Start
 # Copyright (c) 2002-2023, Lawrence Livermore National Security
@@ -12,135 +12,10 @@
 # SUNDIALS Copyright End
 # ---------------------------------------------------------------
 
-if(BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA)
-
-  if((RAJA_BACKENDS MATCHES "TARGET_OPENMP") OR (RAJA_BACKENDS MATCHES "OPENMP"))
-    set(OTHER_LIBS OpenMP::OpenMP_CXX)
-  endif()
-
-  # ----------------------------------------------------------------------------
-  # MPI only
-  # ----------------------------------------------------------------------------
-
-  add_executable(advection_reaction_3D
-    advection_reaction_3D.cpp
-    arkode_driver.cpp
-    cvode_driver.cpp
-    ida_driver.cpp
-    rhs3D.hpp
-    ParallelGrid.hpp
-    backends.hpp)
-
-  # ensure the linker language is reset to CXX
-  set_target_properties(advection_reaction_3D PROPERTIES LINKER_LANGUAGE CXX)
-
-  target_include_directories(advection_reaction_3D
-    PRIVATE
-    ${PROJECT_SOURCE_DIR}/utilities
-    ${MPI_CXX_INCLUDE_DIRS})
-
-  target_link_libraries(advection_reaction_3D
-    PRIVATE
-    sundials_arkode
-    sundials_cvode
-    sundials_ida
-    sundials_nvecmpiplusx
-    sundials_nvecserial
-    RAJA
-    ${MPI_CXX_LIBRARIES}
-    ${OTHER_LIBS})
-
-  install(TARGETS advection_reaction_3D
-    DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D")
-
-  install(FILES README.md
-    DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D")
-
-  # ----------------------------------------------------------------------------
-  # MPI + CUDA
-  # ----------------------------------------------------------------------------
-
-  if(BUILD_NVECTOR_CUDA)
-
-    set_source_files_properties(advection_reaction_3D.cpp
-      PROPERTIES LANGUAGE CUDA)
-    set_source_files_properties(arkode_driver.cpp PROPERTIES LANGUAGE CUDA)
-    set_source_files_properties(cvode_driver.cpp PROPERTIES LANGUAGE CUDA)
-    set_source_files_properties(ida_driver.cpp PROPERTIES LANGUAGE CUDA)
-
-    add_executable(advection_reaction_3D_mpicuda
-      advection_reaction_3D.cpp
-      arkode_driver.cpp
-      cvode_driver.cpp
-      ida_driver.cpp
-      rhs3D.hpp
-      ParallelGrid.hpp
-      backends.hpp)
-
-    # ensure the linker language is reset to CXX
-    set_target_properties(advection_reaction_3D_mpicuda
-      PROPERTIES LINKER_LANGUAGE CXX)
-
-    target_include_directories(advection_reaction_3D_mpicuda
-      PRIVATE
-      ${PROJECT_SOURCE_DIR}/utilities
-      ${MPI_CXX_INCLUDE_DIRS})
-
-    target_link_libraries(advection_reaction_3D_mpicuda
-      PRIVATE
-      sundials_arkode
-      sundials_cvode
-      sundials_ida
-      sundials_nvecmpiplusx
-      sundials_nveccuda
-      RAJA
-      ${MPI_CXX_LIBRARIES}
-      ${OTHER_LIBS})
-
-    target_compile_definitions(advection_reaction_3D_mpicuda PRIVATE USE_CUDA_NVEC)
-
-    install(TARGETS advection_reaction_3D_mpicuda
-      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D")
-
-  endif()
-
-  # ----------------------------------------------------------------------------
-  # MPI + HIP
-  # ----------------------------------------------------------------------------
-
-  if(BUILD_NVECTOR_HIP)
-
-    add_executable(advection_reaction_3D_mpihip
-      advection_reaction_3D.cpp
-      arkode_driver.cpp
-      cvode_driver.cpp
-      ida_driver.cpp
-      rhs3D.hpp
-      ParallelGrid.hpp
-      backends.hpp)
-
-    target_include_directories(advection_reaction_3D_mpihip
-      PRIVATE
-      ${PROJECT_SOURCE_DIR}/utilities
-      ${MPI_CXX_INCLUDE_DIRS})
-
-    target_link_libraries(advection_reaction_3D_mpihip
-      PRIVATE
-      sundials_arkode
-      sundials_cvode
-      sundials_ida
-      sundials_nvecmpiplusx
-      sundials_nvechip
-      RAJA
-      hip::device
-      ${MPI_CXX_LIBRARIES}
-      ${OTHER_LIBS})
-
-    target_compile_definitions(advection_reaction_3D_mpihip PRIVATE USE_HIP_NVEC)
-
-    install(TARGETS advection_reaction_3D_mpihip
-      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D")
-
-  endif()
+if(ENABLE_RAJA)
+  add_subdirectory(raja)
+endif()
 
+if(ENABLE_KOKKOS AND BUILD_NVECTOR_KOKKOS)
+  add_subdirectory(kokkos)
 endif()
diff --git a/benchmarks/advection_reaction_3D/kokkos/CMakeLists.txt b/benchmarks/advection_reaction_3D/kokkos/CMakeLists.txt
new file mode 100644
index 0000000000..2d58e5fe4c
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/CMakeLists.txt
@@ -0,0 +1,61 @@
+# ---------------------------------------------------------------
+# Programmer(s):  Daniel R. Reynolds @ SMU
+# ---------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2023, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ---------------------------------------------------------------
+
+# Build one advection-reaction benchmark executable per enabled Kokkos backend
+if(BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA)
+  foreach(backend ${KOKKOS_EXAMPLES_BACKENDS})
+
+    # executable name carries the backend as a suffix
+    set(benchmark_target "advection_reaction_3D_kokkos.${backend}")
+
+    # sources and headers that make up the benchmark
+    add_executable(${benchmark_target}
+      advection_reaction_3D.cpp
+      arkode_driver.cpp
+      cvode_driver.cpp
+      ida_driver.cpp
+      rhs3D.hpp
+      ParallelGrid.hpp
+      check_retval.h)
+
+    # USE_<backend> tells the sources which Kokkos backend to compile for
+    target_compile_definitions(${benchmark_target} PRIVATE USE_${backend})
+
+    # header search paths
+    target_include_directories(${benchmark_target}
+      PRIVATE
+      ${PROJECT_SOURCE_DIR}/utilities
+      ${MPI_CXX_INCLUDE_DIRS}
+    )
+
+    # SUNDIALS packages and vectors, MPI, and any extra link libraries
+    target_link_libraries(${benchmark_target}
+      PRIVATE
+      sundials_arkode
+      sundials_cvode
+      sundials_ida
+      sundials_nvecmpiplusx
+      sundials_nveckokkos
+      ${MPI_CXX_LIBRARIES}
+      ${EXE_EXTRA_LINK_LIBS}
+    )
+
+    install(TARGETS ${benchmark_target}
+      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/kokkos")
+
+    install(FILES README.md ../scripts/compare_error.py ../scripts/compute_error.py ../scripts/pickle_solution_output.py
+      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/kokkos")
+
+  endforeach()
+endif()
diff --git a/benchmarks/advection_reaction_3D/kokkos/ParallelGrid.hpp b/benchmarks/advection_reaction_3D/kokkos/ParallelGrid.hpp
new file mode 100644
index 0000000000..c324105b02
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/ParallelGrid.hpp
@@ -0,0 +1,593 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * A simple implementation of a parallel structured Cartesian mesh class that
+ * supports up to 3 spatial dimensions and an arbitrary number of degrees of
+ * freedom, and that uses Kokkos views to store communication buffer data.
+ * ----------------------------------------------------------------------------*/
+
+#ifndef _KOKKOSPARGRID_H
+#define _KOKKOSPARGRID_H
+
+#include <iomanip>
+#include <iostream>
+#include <fstream>
+#include <mpi.h>
+#include <Kokkos_Core.hpp>
+#include <sundials/sundials_types.h>
+
+
+/* Set Kokkos execution/memory spaces (from the USE_<backend> macro) and type shortcuts */
+#if defined(USE_CUDA)
+using ExecSpace = Kokkos::Cuda;
+using MemSpace  = Kokkos::CudaSpace;
+#elif defined(USE_HIP)
+#if KOKKOS_VERSION / 10000 > 3 /* this branch uses HIP outside the Experimental namespace */
+using ExecSpace = Kokkos::HIP;
+using MemSpace  = Kokkos::HIPSpace;
+#else
+using ExecSpace = Kokkos::Experimental::HIP;
+using MemSpace  = Kokkos::Experimental::HIPSpace;
+#endif
+#elif defined(USE_OPENMP)
+using ExecSpace = Kokkos::OpenMP;
+using MemSpace  = Kokkos::HostSpace;
+#else /* none of USE_CUDA, USE_HIP, USE_OPENMP defined: serial host execution */
+using ExecSpace = Kokkos::Serial;
+using MemSpace  = Kokkos::HostSpace;
+#endif
+using Vec1D = Kokkos::View<realtype*, MemSpace>;    /* 1D array in MemSpace */
+using Vec4D = Kokkos::View<realtype****, MemSpace>; /* 4D array in MemSpace */
+using Vec1DHost = Vec1D::HostMirror;                /* host mirror of a Vec1D */
+using Vec4DHost = Vec4D::HostMirror;                /* host mirror of a Vec4D */
+using Range3D = Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>>; /* 3D iteration range on ExecSpace */
+
+
+namespace sundials_tools
+{
+
+// Types of boundaries supported by ParallelGrid (currently only periodic).
+enum class BoundaryType
+{
+  PERIODIC  // the Cartesian communicator is created periodic in all three dimensions
+};
+
+// Types of stencils supported by ParallelGrid (currently only upwind).
+enum class StencilType
+{
+  UPWIND  // selects AllocateBuffersUpwind() for the exchange buffers
+};
+
+template<typename GLOBALINT>
+class ParallelGrid
+{
+public:
+  // Constructor that creates a new ParallelGrid object.
+  // [in,out] comm - on input, the overall MPI communicator; on output, the Cartesian communicator
+  // [in] a[] - an array of length 3 with the lower bounds of the domain [a,b]
+  // [in] b[] - an array of length 3 with the upper bounds of the domain [a,b]
+  // [in] npts[] - an array of length 3 which defines the number of mesh points in each dimension
+  // [in] dof - the number of degrees of freedom per mesh point
+  // [in] bc - the type of boundary conditions (see BoundaryType)
+  // [in] st - the stencil to use (see StencilType)
+  // [in] c - the advection speed; upwindRight is set to (c > 0)
+  // [in] npxyz - the number of processors in each dimension; defaults to nullptr (all zeros), which means MPI will choose
+  // [in] reorder - should MPI_Cart_create do process reordering to optimize or not; defaults to false (some MPI implementations ignore this)
+  ParallelGrid(MPI_Comm* comm, const realtype a[], const realtype b[], const GLOBALINT npts[],
+               int dof, BoundaryType bc, StencilType st, const realtype c,
+               const int npxyz[] = nullptr, bool reorder = false)
+    : nx(1), ny(1), nz(1),
+      nxl(1), nyl(1), nzl(1),
+      npx(1), npy(1), npz(1),
+      dx(0.0), dy(0.0), dz(0.0),
+      ax(0.0), ay(0.0), az(0.0),
+      bx(0.0), by(0.0), bz(0.0),
+      dof(dof), neq(0), upwindRight(true), dims{0,0,0}, coords{0,0,0},
+      cart_comm(MPI_COMM_NULL), nreq(0), bc(bc), st(st) // nreq = 0: ExchangeEnd() before ExchangeStart() is a no-op
+  {
+    assert(st == StencilType::UPWIND);
+
+    /* Set up MPI Cartesian communicator (zero entries in dims are left for MPI to fill) */
+    if (npxyz)
+    {
+      dims[0] = npxyz[0];
+      dims[1] = npxyz[1];
+      dims[2] = npxyz[2];
+    }
+
+    int retval, nprocs;
+    MPI_Comm_size(*comm, &nprocs);
+    retval = MPI_Dims_create(nprocs, 3, dims);
+    assert(retval == MPI_SUCCESS);
+
+    int periods[] = { bc == BoundaryType::PERIODIC,
+                      bc == BoundaryType::PERIODIC,
+                      bc == BoundaryType::PERIODIC };
+    retval = MPI_Cart_create(*comm, 3, dims, periods, reorder, comm);
+    assert(retval == MPI_SUCCESS);
+
+    retval = MPI_Cart_get(*comm, 3, dims, periods, coords);
+    assert(retval == MPI_SUCCESS);
+
+    cart_comm = *comm;
+
+    /* Set upwinding direction */
+    upwindRight = (c > 0.0);
+
+    /* Set up information for the first spatial dimension */
+    npx = dims[0];
+    nx  = npts[0];
+    ax  = a[0];
+    bx  = b[0];
+    dx  = (bx-ax) / (realtype) nx;
+    int is = nx*(coords[0])/npx;     /* first global index owned by this process */
+    int ie = nx*(coords[0]+1)/npx-1; /* last global index owned by this process  */
+    nxl = ie-is+1;
+    neq = dof * nxl;
+
+    /* Set up information for the second spatial dimension */
+    npy = dims[1];
+    ny  = npts[1];
+    ay  = a[1];
+    by  = b[1];
+    dy  = (by-ay) / (realtype) ny;
+    int js = ny*(coords[1])/npy;
+    int je = ny*(coords[1]+1)/npy-1;
+    nyl = je-js+1;
+    neq *= nyl;
+
+    /* Set up information for the third spatial dimension */
+    npz = dims[2];
+    nz  = npts[2];
+    az  = a[2];
+    bz  = b[2];
+    dz  = (bz-az) / (realtype) nz;
+    int ks = nz*(coords[2])/npz;
+    int ke = nz*(coords[2]+1)/npz-1;
+    nzl = ke-ks+1;
+    neq *= nzl;
+
+    /* Allocate buffers for nearest-neighbor exchange */
+    if (st == StencilType::UPWIND)
+      AllocateBuffersUpwind();
+
+  }
+
+  // Allocates the exchange buffers needed by the upwind stencil and looks up
+  // the rank of the neighbor across each of the six faces (ipW, ipE, ipS, ipN,
+  // ipB, ipF); a face without a neighbor keeps MPI_PROC_NULL.
+  // TODO: support non-periodic boundary conditions
+  void AllocateBuffersUpwind()
+  {
+    /* upwindRight: receive on W/S/B and send on E/N/F; otherwise the reverse */
+    /* Allocate send/receive buffers and determine ID for communication West */
+    if (upwindRight) {
+      Wrecv_  = Vec1D("Wrecv", dof*nyl*nzl);
+      WrecvH_ = Kokkos::create_mirror_view(Wrecv_);
+    } else {
+      Wsend_  = Vec1D("Wsend", dof*nyl*nzl);
+      WsendH_ = Kokkos::create_mirror_view(Wsend_);
+    }
+    ipW = MPI_PROC_NULL;
+    if ((coords[0] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0]-1, coords[1], coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipW);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication East */
+    if (upwindRight) {
+      Esend_  = Vec1D("Esend", dof*nyl*nzl);
+      EsendH_ = Kokkos::create_mirror_view(Esend_);
+    } else {
+      Erecv_  = Vec1D("Erecv", dof*nyl*nzl);
+      ErecvH_ = Kokkos::create_mirror_view(Erecv_);
+    }
+    ipE = MPI_PROC_NULL;
+    if ((coords[0] < dims[0]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0]+1, coords[1], coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipE);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication South */
+    if (upwindRight) {
+      Srecv_  = Vec1D("Srecv", dof*nxl*nzl);
+      SrecvH_ = Kokkos::create_mirror_view(Srecv_);
+    } else {
+      Ssend_  = Vec1D("Ssend", dof*nxl*nzl);
+      SsendH_ = Kokkos::create_mirror_view(Ssend_);
+    }
+    ipS = MPI_PROC_NULL;
+    if ((coords[1] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1]-1, coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipS);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication North */
+    if (upwindRight) {
+      Nsend_  = Vec1D("Nsend", dof*nxl*nzl);
+      NsendH_ = Kokkos::create_mirror_view(Nsend_);
+    } else {
+      Nrecv_  = Vec1D("Nrecv", dof*nxl*nzl);
+      NrecvH_ = Kokkos::create_mirror_view(Nrecv_);
+    }
+    ipN = MPI_PROC_NULL;
+    if ((coords[1] < dims[1]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1]+1, coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipN);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication Back */
+    if (upwindRight) {
+      Brecv_  = Vec1D("Brecv", dof*nxl*nyl);
+      BrecvH_ = Kokkos::create_mirror_view(Brecv_);
+    } else {
+      Bsend_  = Vec1D("Bsend", dof*nxl*nyl);
+      BsendH_ = Kokkos::create_mirror_view(Bsend_);
+    }
+    ipB = MPI_PROC_NULL;
+    if ((coords[2] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1], coords[2]-1};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipB);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication Front */
+    if (upwindRight) {
+      Fsend_  = Vec1D("Fsend", dof*nxl*nyl);
+      FsendH_ = Kokkos::create_mirror_view(Fsend_);
+    } else {
+      Frecv_  = Vec1D("Frecv", dof*nxl*nyl);
+      FrecvH_ = Kokkos::create_mirror_view(Frecv_);
+    }
+    ipF = MPI_PROC_NULL;
+    if ((coords[2] < dims[2]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1], coords[2]+1};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipF);
+      assert(retval == MPI_SUCCESS);
+    }
+
+  }
+
+  // Initiate non-blocking neighbor communication; ExchangeEnd() waits for it to complete.
+  int ExchangeStart()
+  {
+    int retval = 0;
+    nreq = 0;
+
+    // Initialize all requests in array
+    for (int i=0; i<12; i++)
+      req[i] = MPI_REQUEST_NULL;
+
+    // Open an Irecv buffer on host for each neighbor; each tag matches the Isend on the opposite face
+    if ((ipW != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Irecv(WrecvH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipW,
+                         1, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipE != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Irecv(ErecvH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipE,
+                         0, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipS != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Irecv(SrecvH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipS,
+                         3, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipN != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Irecv(NrecvH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipN,
+                         2, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipB != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Irecv(BrecvH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipB,
+                         5, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipF != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Irecv(FrecvH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipF,
+                         4, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    // Send data to neighbors, first copying from device to host buffers
+    if ((ipW != MPI_PROC_NULL) && (!upwindRight))
+    {
+      Kokkos::deep_copy(WsendH_, Wsend_);
+      retval = MPI_Isend(WsendH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, 0,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipE != MPI_PROC_NULL) && (upwindRight))
+    {
+      Kokkos::deep_copy(EsendH_, Esend_);
+      retval = MPI_Isend(EsendH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, 1,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipS != MPI_PROC_NULL) && (!upwindRight))
+    {
+      Kokkos::deep_copy(SsendH_, Ssend_);
+      retval = MPI_Isend(SsendH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, 2,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipN != MPI_PROC_NULL) && (upwindRight))
+    {
+      Kokkos::deep_copy(NsendH_, Nsend_);
+      retval = MPI_Isend(NsendH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, 3,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipB != MPI_PROC_NULL) && (!upwindRight))
+    {
+      Kokkos::deep_copy(BsendH_, Bsend_);
+      retval = MPI_Isend(BsendH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, 4,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipF != MPI_PROC_NULL) && (upwindRight))
+    {
+      Kokkos::deep_copy(FsendH_, Fsend_);
+      retval = MPI_Isend(FsendH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, 5,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    return retval; // return value of the last MPI call made, or 0 if nothing was posted
+  }
+
+  // Completes the neighbor exchange: waits for every posted send/receive and
+  // then copies the received data from the host mirrors into the receive
+  // buffers in MemSpace.
+  // Returns 0 when no request is outstanding, otherwise the MPI_Waitall result.
+  int ExchangeEnd()
+  {
+    // nothing outstanding, so there is nothing to wait for or copy
+    if (nreq == 0) return 0;
+
+    // block until all posted messages have finished
+    MPI_Status statuses[12];
+    const int flag = MPI_Waitall(nreq, req, statuses);
+    assert(flag == MPI_SUCCESS);
+
+    // host -> device copies; only the faces that received data take part
+    if (upwindRight)
+    {
+      if (ipW != MPI_PROC_NULL) Kokkos::deep_copy(Wrecv_, WrecvH_);
+      if (ipS != MPI_PROC_NULL) Kokkos::deep_copy(Srecv_, SrecvH_);
+      if (ipB != MPI_PROC_NULL) Kokkos::deep_copy(Brecv_, BrecvH_);
+    }
+    else
+    {
+      if (ipE != MPI_PROC_NULL) Kokkos::deep_copy(Erecv_, ErecvH_);
+      if (ipN != MPI_PROC_NULL) Kokkos::deep_copy(Nrecv_, NrecvH_);
+      if (ipF != MPI_PROC_NULL) Kokkos::deep_copy(Frecv_, FrecvH_);
+    }
+
+    return flag;
+  }
+
+  // Prints the process layout, domain, mesh sizes, mesh spacing, and upwind direction to stdout.
+  void PrintInfo()
+  {
+    printf("ParallelGrid Info:\n");
+    printf("    dimensions = %d\n", 3);
+    printf("    processors = {%d, %d, %d}\n", npx, npy, npz);
+    // %g expects a double; realtype may be another floating-point type, so cast
+    printf("        domain = {[%g,%g], [%g,%g], [%g,%g]}\n",
+           (double) ax, (double) bx, (double) ay, (double) by, (double) az, (double) bz);
+    printf("   global npts = {%li, %li, %li}\n", (long int) nx, (long int) ny, (long int) nz);
+    printf("    local npts = {%d, %d, %d}\n", nxl, nyl, nzl);
+    printf("  mesh spacing = {%g, %g, %g}\n", (double) dx, (double) dy, (double) dz);
+    if (upwindRight) printf("    upwind dir = right\n");
+    else             printf("    upwind dir = left\n");
+  }
+
+  // Saves the global mesh point coordinates to a text file.
+  //    First row is x. Second row is y. Third row is z. Each row starts at the
+  //    lower domain bound (ax, ay, az) and advances by the spacing (dx, dy, dz).
+  //    Can be loaded into MATLAB like so:
+  //      mesh = loadtxt('mesh.txt');
+  //      [X,Y,Z] = meshgrid(mesh(1,:),mesh(2,:),mesh(3,:));
+  void MeshToFile(const std::string& fname)
+  {
+    std::ofstream mesh_file(fname);
+    mesh_file << std::setprecision(16);
+    for (GLOBALINT i = 0; i < nx; i++)
+      mesh_file << " " << (ax + dx*i);
+    mesh_file << std::endl;
+    for (GLOBALINT i = 0; i < ny; i++)
+      mesh_file << " " << (ay + dy*i);
+    mesh_file << std::endl;
+    for (GLOBALINT i = 0; i < nz; i++)
+      mesh_file << " " << (az + dz*i);
+    mesh_file << std::endl;
+    mesh_file.close();
+  }
+
+  // Total number of MPI processes in the Cartesian process grid.
+  int nprocs() const { return npx*npy*npz; }
+
+  // Total number of mesh points in the global grid.
+  GLOBALINT npts() const { return nx*ny*nz; }
+
+  // Number of mesh points owned by this process. The local sizes are ints, so
+  // widen to GLOBALINT before multiplying to keep the product from overflowing int.
+  GLOBALINT nptsl() const
+  {
+    return static_cast<GLOBALINT>(nxl)*nyl*nzl;
+  }
+
+  // Number of unknowns owned by this process: dof values at each of the
+  // nptsl() local mesh points.
+  GLOBALINT neql() const
+  {
+    return dof*nptsl();
+  }
+
+  // Returns the raw data pointer of the MemSpace receive buffer for one face.
+  //   direction - "WEST", "EAST", "NORTH", "SOUTH", "FRONT", or "BACK"
+  // Only the receive views assigned in AllocateBuffersUpwind() for the current
+  // upwind direction are allocated; the others are default-constructed views.
+  // Any other direction string trips the assert (when asserts are enabled)
+  // and the function
+  // returns nullptr.
+  realtype* GetRecvView(const std::string& direction)
+  {
+    realtype* buffer = nullptr;
+
+    // faces normal to the first dimension
+    if (direction == "WEST")
+      buffer = Wrecv_.data();
+    else if (direction == "EAST")
+      buffer = Erecv_.data();
+    // faces normal to the second dimension
+    else if (direction == "NORTH")
+      buffer = Nrecv_.data();
+    else if (direction == "SOUTH")
+      buffer = Srecv_.data();
+    // faces normal to the third dimension
+    else if (direction == "FRONT")
+      buffer = Frecv_.data();
+    else if (direction == "BACK")
+      buffer = Brecv_.data();
+    // anything else is a caller error
+    else
+      assert(direction == "ILLEGAL");
+
+    return buffer;
+  }
+
+  // Returns the raw data pointer of the MemSpace send buffer for one face.
+  //   direction - "WEST", "EAST", "NORTH", "SOUTH", "FRONT", or "BACK"
+  // Only the send views assigned in AllocateBuffersUpwind() for the current
+  // upwind direction are allocated; the others are default-constructed views.
+  // Any other direction string trips the assert (when asserts are enabled)
+  // and the function
+  // returns nullptr.
+  realtype* GetSendView(const std::string& direction)
+  {
+    realtype* buffer = nullptr;
+
+    // faces normal to the first dimension
+    if (direction == "WEST")
+      buffer = Wsend_.data();
+    else if (direction == "EAST")
+      buffer = Esend_.data();
+    // faces normal to the second dimension
+    else if (direction == "NORTH")
+      buffer = Nsend_.data();
+    else if (direction == "SOUTH")
+      buffer = Ssend_.data();
+    // faces normal to the third dimension
+    else if (direction == "FRONT")
+      buffer = Fsend_.data();
+    else if (direction == "BACK")
+      buffer = Bsend_.data();
+    // anything else is a caller error
+    else
+      assert(direction == "ILLEGAL");
+
+    return buffer;
+  }
+
+  GLOBALINT nx, ny, nz;    /* number of mesh points globally     */
+  int       nxl, nyl, nzl; /* number of mesh points locally      */
+  int       npx, npy, npz; /* number of processes per dimension  */
+  realtype  dx, dy, dz;    /* mesh spacing                       */
+  realtype  ax, ay, az;    /* domain in [a, b]: lower bounds     */
+  realtype  bx, by, bz;    /* domain in [a, b]: upper bounds     */
+  int       dof;           /* degrees of freedom per node        */
+  int       neq;           /* total number of equations locally  */
+
+  int       ipW, ipE;      /* MPI ranks for neighbor procs       */
+  int       ipS, ipN;      /* (MPI_PROC_NULL when no neighbor)   */
+  int       ipB, ipF;
+  bool      upwindRight;   /* Upwind dir: true/false == R/L      */
+
+  int       dims[3];       /* processes per dim in the cart comm */
+  int       coords[3];     /* this process's cartesian coords    */
+
+
+private:
+  MPI_Comm     cart_comm;  /* MPI cartesian communicator         */
+  MPI_Request  req[12];    /* requests of the current exchange   */
+  int          nreq;       /* number of entries of req in use    */
+
+  BoundaryType bc;         /* boundary condition type            */
+  StencilType  st;         /* stencil type                       */
+
+  Vec1D Wsend_;            /* MPI send/recv buffers              */
+  Vec1D Esend_;
+  Vec1D Ssend_;
+  Vec1D Nsend_;
+  Vec1D Bsend_;
+  Vec1D Fsend_;
+  Vec1D Wrecv_;
+  Vec1D Erecv_;
+  Vec1D Srecv_;
+  Vec1D Nrecv_;
+  Vec1D Brecv_;
+  Vec1D Frecv_;
+  Vec1DHost WsendH_;       /* MPI send/recv buffers (host)       */
+  Vec1DHost EsendH_;
+  Vec1DHost SsendH_;
+  Vec1DHost NsendH_;
+  Vec1DHost BsendH_;
+  Vec1DHost FsendH_;
+  Vec1DHost WrecvH_;
+  Vec1DHost ErecvH_;
+  Vec1DHost SrecvH_;
+  Vec1DHost NrecvH_;
+  Vec1DHost BrecvH_;
+  Vec1DHost FrecvH_;
+
+};
+
+}
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/kokkos/README.md b/benchmarks/advection_reaction_3D/kokkos/README.md
new file mode 100644
index 0000000000..f27484385f
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/README.md
@@ -0,0 +1,113 @@
+# Benchmark: 3D Advection-Reaction
+
+This benchmark problem implements a 3D advection-reaction equation using the
+Kokkos performance portability layer with serial, OpenMP, CUDA, or HIP backends.
+
+## Problem description
+
+This code simulates the advection and reaction of three chemical species where
+the reaction mechanism is a variation of the Brusselator problem from chemical
+kinetics. The PDE system is given by
+```math
+\begin{align}
+  u_t &= -c \nabla u + A - (w+1) u + v u^2 \\
+  v_t &= -c \nabla v + w u - v u^2 \\
+  w_t &= -c \nabla w + (B - w) / \epsilon - w u
+\end{align}
+```
+where $u$, $v$, and $w$ are chemical concentrations, $c$ is the advection speed,
+$A$ and $B$ are the concentrations of chemical species that remain constant over
+space and time, and $\epsilon$ is a parameter that varies the stiffness of the
+system. The problem is solved on the domain $(x,y,z) = X$ in $[0, X_{\text{max}}]^3$, 
+for times $t$ in $[0,t_f]$. The initial condition is
+```math
+\begin{align}
+    u(0,X) &= A + p(X) \\
+    v(0,X) &= B / A + p(X) \\
+    w(0,X) &= 3.0 + p(X)
+\end{align}
+```
+where the perturbation function is
+```math
+    p(X) = \alpha \exp\left( -\frac{(X-\mu)^T \sigma^{-1} (X-\mu)}{2 \sqrt{|\sigma| 8 \pi^3}} \right)
+```
+with $\alpha = 0.1$, $\mu = 0.5 X_{\text{max}}$, and $\sigma$ is a diagonal 
+matrix with entries $0.25 X_{\text{max}}$.
+
+Spatial derivatives are discretized with first-order upwind finite differences
+on a uniform spatial grid. The system can be evolved in time using explicit,
+implicit, or IMEX methods from ARKODE, Adams or BDF methods from CVODE, or BDF
+methods from IDA. When using an IMEX method, advection is treated explicitly and
+reactions implicitly.
+
+The nonlinear system(s) that arise in each time step may be solved using a
+global Newton method with a matrix-free GMRES linear solver or an Anderson
+accelerated fixed-point method. When using an IMEX method, a custom task-local
+nonlinear solver that leverages the locality of the reaction systems may also be
+used.
+
+## Options
+
+Several command line options are available to change the problem parameters
+as well as the integrator and solver options. A summary of the options is
+listed below.
+
+| Option                      | Description                                                                   | Default     |
+|:----------------------------|:------------------------------------------------------------------------------|:------------|
+| `--help`                    | Print the command line options and description                                | --          |
+| `--dont-save`               | Do not save the solution to the disk                                          | Save        |
+| `--output-dir <dir>`        | Directory where all output files will be written                              | `.`         |
+| `--nout <int>`              | Number of output times                                                        | 40          |
+| `--npts <int>`              | Number of mesh points in each direction                                       | 100         |
+| `--npxyz <int> <int> <int>` | Number of MPI tasks in each direction (0 forces MPI to decide)                | 0 0 0       |
+| `--xmax <realtype>`         | Maximum value of `x`, `y`, and `z`, $X_{\text{max}}$                          | 1.0         |
+| `--A <realtype>`            | Constant concentration of species `A`                                         | 1.0         |
+| `--B <realtype>`            | Constant concentration of species `B`                                         | 3.5         |
+| `--c <realtype>`            | Advection speed `c`                                                           | 0.01        |
+| `--order <int>`             | Integration method order                                                      | 3           |
+| `--method <method>`         | Integrator to use: `ERK`, `ARK-DIRK`, `ARK-IMEX`, `CV-BDF`, `CV-ADAMS`, `IDA` | `ARK-DIRK`  |
+| `--nls <method>`            | Nonlinear Solver Method: `newton`, `tl-newton`, `fixedpoint`, `none`          | `newton`    |
+| `--fpaccel <int>`           | Number of fixed point acceleration vectors                                    | 3           |
+| `--nopre`                   | Disable preconditioning                                                       | False       |
+| `--fused`                   | Enable fused operations                                                       | Off         |
+| `--tf <realtype>`           | Final integration time `t_f`                                                  | 10.0        |
+| `--rtol <realtype>`         | Relative tolerance                                                            | 1.0e-6      |
+| `--atol <realtype>`         | Absolute tolerance                                                            | 1.0e-9      |
+
+## Building and Running
+
+To build the benchmark executables SUNDIALS must be configured with ARKODE,
+CVODE, and IDA enabled and with MPI and Kokkos support on. Additionally, either
+CUDA or HIP support must be on to build executables utilizing NVIDIA or AMD
+GPUs. See the installation guide for more details on configuring, building,
+and installing SUNDIALS.
+
+Based on the configuration the following executables will be built and installed
+in the `<benchmarks install prefix>/advection_reaction_3D/kokkos` directory:
+
+* `advection_reaction_3D_kokkos.SERIAL` -- MPI parallelism
+* `advection_reaction_3D_kokkos.OPENMP` -- MPI + OpenMP parallelism
+* `advection_reaction_3D_kokkos.CUDA` -- MPI + CUDA parallelism
+* `advection_reaction_3D_kokkos.HIP` -- MPI + HIP parallelism
+
+On Summit, with the default environment
+```
+  Compiler: xl/16.1.1-5
+  MPI: spectrum-mpi/10.3.1.2-20200121
+  CUDA: cuda/10.1.243
+```
+an example `jsrun` command is
+```
+jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_kokkos.CUDA
+```
+
+On Lassen, with the environment
+```
+  Compiler: gcc/8.3.1
+  MPI: mvapich2/2021.05.28-cuda-11.1.1
+  CUDA: cuda/11.1.1
+```
+an example `jsrun` command is
+```
+jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_kokkos.CUDA
+```
diff --git a/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.cpp b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.cpp
new file mode 100644
index 0000000000..fa9f2bcc94
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.cpp
@@ -0,0 +1,711 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                David J. Gardner, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * This benchmark problem simulates the advection and reaction of three
+ * chemical species, u, v, and w, in a three dimensional domain. The reaction
+ * mechanism is a variation of the Brusselator problem from chemical kinetics.
+ * This is a PDE system with 3 components, Y = [u,v,w], satisfying the
+ * equations,
+ *
+ *    u_t = -c * dot(grad,u) + A - (w+1) * u + v * u^2
+ *    v_t = -c * dot(grad,v) + w * u - v * u^2
+ *    w_t = -c * dot(grad,w) + (B - w) / ep - w * u
+ *
+ * for t in [0,tf] and X = (x,y,z) in [0,xmax]^3 with periodic boundary
+ * conditions. The initial condition is
+ *
+ *    u(0,X) = k1 * A / k4 + p(X)
+ *    v(0,X) = k2 * k4 * B / (k1 * k3 * A) + p(X)
+ *    w(0,X) = 3.0 + p(X)
+ *    p(X)   = alpha * e^( -((X - mu)^T Sigma^{-1} (X - mu)) / (2*sqrt(|Sigma|*(2pi)^3)) )
+ *
+ * alpha = 0.1, mu = (xmax/2.0, xmax/2.0, xmax/2.0), and Sigma = diag(xmax/4.0).
+ * The reaction rates are set so k_1 = k_2 = k_3 = k_4 = k, and k_5 = k_6
+ * = 1/5e-6. The spatial derivatives are discretized with first-order upwind
+ * finite differences. NOUT outputs are printed at equal intervals, and run
+ * statistics are printed at the end.
+ *
+ * Command line options:
+ *   --help             prints this message
+ *   --dont-save        do not save the solution to the filesystem at the nout interval (default is to save)
+ *   --output-dir       the directory where all output files will be written
+ *   --nout <int>       number of output times
+ *   --method           ERK, ARK-DIRK, ARK-IMEX (default), CV-BDF, CV-ADAMS, IDA
+ *   --nls              nonlinear solver to use; options are newton,
+ *                      tl-newton (task-local newton), or fixedpoint
+ *   --fpaccel          the number of fixed-point acceleration vectors to use
+ *                      (only valid when using fixedpoint nonlinear solver)
+ *   --nopre            turn off preconditioning
+ *   --order <int>      the method order to use
+ *   --npts <int>       number of mesh points in each direction
+ *   --xmax <realtype>  maximum value of x (size of domain)
+ *   --tf <realtype>    final time
+ *   --A <realtype>     A parameter value
+ *   --B <realtype>     B parameter value
+ *   --k <realtype>     reaction rate
+ *   --c <realtype>     advection speed
+ *   --rtol <realtype>  relative tolerance
+ *   --atol <realtype>  absolute tolerance
+ * --------------------------------------------------------------------------*/
+
+#include "advection_reaction_3D.hpp"
+
+
+/* Main Program: sets up MPI, SUNDIALS, and Kokkos, then runs the selected integrator */
+int main(int argc, char *argv[])
+{
+
+  SUNContext ctx;
+
+  /* Initialize MPI */
+  MPI_Comm comm = MPI_COMM_WORLD;
+  MPI_Init(&argc, &argv);
+
+  /* Create SUNDIALS context */
+  SUNContext_Create((void*) &comm, &ctx);
+
+  /* Initialize Kokkos */
+  Kokkos::initialize(argc, argv);
+  {
+    /* General problem variables. uopt is declared before udata so that it is still alive
+       when udata is destroyed: ~UserData reads uopt->nout (presumably this object). */
+    N_Vector     y = NULL;      /* empty solution vector        */
+    UserOptions  uopt;          /* user options                 */
+    UserData     udata(ctx);    /* user data                    */
+    int          retval;        /* reusable error-checking flag */
+
+    SUNDIALS_CXX_MARK_FUNCTION(udata.prof);
+
+    /* Process input arguments and set up the problem */
+    retval = SetupProblem(argc, argv, &udata, &uopt, ctx);
+    if (check_retval(&retval, "SetupProblem", 1, udata.myid)) MPI_Abort(comm, 1);
+
+    /* Create solution vector (on-node and MPI-parallel versions) */
+    SUNVector yloc{(unsigned int)udata.grid->neq, ctx};
+    y = N_VMake_MPIPlusX(udata.comm, yloc, ctx);
+    if (check_retval((void *) y, "N_VMake_MPIPlusX", 0, udata.myid)) MPI_Abort(comm, 1);
+
+    /* Set the initial condition */
+    retval = SetIC(y, &udata);
+    if (check_retval(&retval, "SetIC", 1, udata.myid)) MPI_Abort(comm, 1);
+
+    /* Output spatial mesh to disk (add extra point for periodic BC) */
+    if (udata.myid == 0 && uopt.nout > 0)
+    {
+      char fname[MXSTR];
+      snprintf(fname, MXSTR, "%s/mesh.txt", uopt.outputdir);
+      udata.grid->MeshToFile(fname);
+    }
+
+    /* Integrate in time (an unrecognized method name runs no integrator here) */
+    if (uopt.method == "ERK")           retval = EvolveProblemExplicit(y, &udata, &uopt);
+    else if (uopt.method == "ARK-DIRK") retval = EvolveProblemDIRK(y, &udata, &uopt);
+    else if (uopt.method == "ARK-IMEX") retval = EvolveProblemIMEX(y, &udata, &uopt);
+    else if (uopt.method == "CV-BDF")   retval = EvolveProblemBDF(y, &udata, &uopt);
+    else if (uopt.method == "CV-ADAMS") retval = EvolveProblemAdams(y, &udata, &uopt);
+    else if (uopt.method == "IDA")      retval = EvolveDAEProblem(y, &udata, &uopt);
+    if (check_retval(&retval, "Evolve", 1, udata.myid)) MPI_Abort(comm, 1);
+
+    /* Clean up */
+    N_VDestroy(y);
+  } /* the scope ends here so its locals are destroyed before Kokkos::finalize() */
+  Kokkos::finalize();
+  SUNContext_Free(&ctx);
+  MPI_Finalize();
+  return(0);
+}
+
+
+/* Destructor for problem data: closes the output files, destroys the solution
+   masks, and frees the parallel grid. */
+UserData::~UserData()
+{
+  /* output streams are only closed when output was requested */
+  if (uopt->nout > 0)
+  {
+    FILE* const solution_files[] = { UFID, VFID, WFID };
+    for (FILE* stream : solution_files)
+    {
+      if (stream) fclose(stream);
+    }
+
+    /* TFID is closed by rank 0 only */
+    if (myid == 0 && TFID) fclose(TFID);
+  }
+
+  /* destroy the solution masks that exist and clear their pointers */
+  N_Vector* const masks[] = { &umask, &vmask, &wmask };
+  for (N_Vector* mask : masks)
+  {
+    if (*mask == nullptr) continue;
+    N_VDestroy(*mask);
+    *mask = nullptr;
+  }
+
+  /* free the parallel grid */
+  delete grid;
+}
+
+
+/* --------------------------------------------------------------
+ * Communication functions
+ * --------------------------------------------------------------*/
+
+/* Copies the boundary planes that the upwind neighbors need out of the local
+   solution and into the parallel grid's send buffers. Nothing is packed when
+   the advection speed is exactly zero. Always returns 0. */
+int FillSendBuffers(N_Vector y, UserData* udata)
+{
+  /* Advection speed and local grid extents */
+  const realtype speed = udata->c;
+  const int nx   = udata->grid->nxl;
+  const int ny   = udata->grid->nyl;
+  const int nz   = udata->grid->nzl;
+  const int nvar = udata->grid->dof;
+
+  /* Index of the last local plane in each direction */
+  const int ilast = nx - 1;
+  const int jlast = ny - 1;
+  const int klast = nz - 1;
+
+  /* View the local vector data as a 4D array indexed (i, j, k, component) */
+  Vec4D Y(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nx, ny, nz, nvar);
+
+  if (speed > 0.0)
+  {
+    /* Positive flow uses backward differences, so the last plane in each
+       direction goes into the East, North, and Front send buffers. */
+    Vec4D east (udata->grid->GetSendView("EAST"),  1, ny, nz, nvar);
+    Vec4D north(udata->grid->GetSendView("NORTH"), nx, 1, nz, nvar);
+    Vec4D front(udata->grid->GetSendView("FRONT"), nx, ny, 1, nvar);
+
+    /* Fill the buffers with one parallel_for per face */
+    Kokkos::parallel_for("FillEastBuffer",
+                         Range3D({0,0,0},{ny,nz,nvar}),
+                         KOKKOS_LAMBDA (int jy, int kz, int m) {
+      east(0,jy,kz,m) = Y(ilast,jy,kz,m);
+    });
+    Kokkos::parallel_for("FillNorthBuffer",
+                         Range3D({0,0,0},{nx,nz,nvar}),
+                         KOKKOS_LAMBDA (int ix, int kz, int m) {
+      north(ix,0,kz,m) = Y(ix,jlast,kz,m);
+    });
+    Kokkos::parallel_for("FillFrontBuffer",
+                         Range3D({0,0,0},{nx,ny,nvar}),
+                         KOKKOS_LAMBDA (int ix, int jy, int m) {
+      front(ix,jy,0,m) = Y(ix,jy,klast,m);
+    });
+  }
+  else if (speed < 0.0)
+  {
+    /* Negative flow uses forward differences, so the first plane in each
+       direction goes into the West, South, and Back send buffers. */
+    Vec4D west (udata->grid->GetSendView("WEST"),  1, ny, nz, nvar);
+    Vec4D south(udata->grid->GetSendView("SOUTH"), nx, 1, nz, nvar);
+    Vec4D back (udata->grid->GetSendView("BACK"),  nx, ny, 1, nvar);
+
+    /* Fill the buffers with one parallel_for per face */
+    Kokkos::parallel_for("FillWestBuffer",
+                         Range3D({0,0,0},{ny,nz,nvar}),
+                         KOKKOS_LAMBDA (int jy, int kz, int m) {
+      west(0,jy,kz,m) = Y(0,jy,kz,m);
+    });
+    Kokkos::parallel_for("FillSouthBuffer",
+                         Range3D({0,0,0},{nx,nz,nvar}),
+                         KOKKOS_LAMBDA (int ix, int kz, int m) {
+      south(ix,0,kz,m) = Y(ix,0,kz,m);
+    });
+    Kokkos::parallel_for("FillBackBuffer",
+                         Range3D({0,0,0},{nx,ny,nvar}),
+                         KOKKOS_LAMBDA (int ix, int jy, int m) {
+      back(ix,jy,0,m) = Y(ix,jy,0,m);
+    });
+  }
+
+  return(0);
+}
+
+
+/* --------------------------------------------------------------
+ * Problem setup
+ * --------------------------------------------------------------*/
+
+/* Parses the CLI arguments into udata and uopt. Only the options present on
+   the command line are modified; argv is never read past argc. Returns 0 on
+   success and -1 when --help is given or when an option is unknown, lacks
+   its value(s), or carries a rejected value. */
+int ParseArgs(int argc, char *argv[], UserData* udata, UserOptions* uopt)
+{
+  /* loop over input args and get value (nothing to do when argc <= 1) */
+  for (int i = 1; i < argc; i++)
+  {
+    string argvi(argv[i]);
+
+    /* Number of values that must follow this option: none for the pure
+       flags, one for everything else unless listed otherwise. */
+    int nvals = 1;
+    if (argvi == "--help" || argvi == "--dont-save" ||
+        argvi == "--nopre" || argvi == "--fused")
+      nvals = 0;
+    else if (argvi == "--npxyz")
+      nvals = 3;
+    else if (argvi == "--k")
+      nvals = 4;
+
+    /* Refuse to read past the end of argv */
+    if (i + nvals >= argc)
+    {
+      if (udata->myid == 0)
+        fprintf(stderr, "ERROR: option %s is unknown or is missing its value(s)\n",
+                argv[i]);
+      InputError(argv[0]);
+      return(-1);
+    }
+
+    /* Dispatch on the option name */
+    if (argvi == "--help")
+    {
+      InputError(argv[0]);
+      return(-1);
+    }
+    else if (argvi == "--nout")
+      uopt->nout = atoi(argv[++i]);
+    else if (argvi == "--dont-save")
+      uopt->save = 0;
+    else if (argvi == "--output-dir")
+    {
+      /* Leave room in an MXSTR-sized buffer for the "/u.NNNNNN.txt" style
+         file name that is appended to the directory when files are opened. */
+      if (strlen(argv[i+1]) + 32 > MXSTR)
+      {
+        if (udata->myid == 0)
+          fprintf(stderr, "ERROR: output directory string is too long\n");
+        return(-1);
+      }
+      uopt->outputdir = argv[++i];
+    }
+    else if (argvi == "--npts")
+      uopt->npts = atoi(argv[++i]);
+    else if (argvi == "--npxyz")
+    {
+      /* processor counts in x, y, z */
+      uopt->npxyz[0] = atoi(argv[++i]);
+      uopt->npxyz[1] = atoi(argv[++i]);
+      uopt->npxyz[2] = atoi(argv[++i]);
+    }
+    else if (argvi == "--xmax")
+      udata->xmax = strtod(argv[++i], NULL);
+    else if (argvi == "--A")
+      udata->A = strtod(argv[++i], NULL);
+    else if (argvi == "--B")
+      udata->B = strtod(argv[++i], NULL);
+    else if (argvi == "--k")
+    {
+      /* the four reaction rates k1..k4 */
+      udata->k1 = strtod(argv[++i], NULL);
+      udata->k2 = strtod(argv[++i], NULL);
+      udata->k3 = strtod(argv[++i], NULL);
+      udata->k4 = strtod(argv[++i], NULL);
+    }
+    else if (argvi == "--c")
+      udata->c = strtod(argv[++i], NULL);
+    else if (argvi == "--order")
+      uopt->order = atoi(argv[++i]);
+    else if (argvi == "--method")
+    {
+      uopt->method = string(argv[++i]);
+      /* only the supported method names are accepted */
+      if (uopt->method != "ERK" &&
+          uopt->method != "ARK-DIRK" &&
+          uopt->method != "ARK-IMEX" &&
+          uopt->method != "CV-BDF" &&
+          uopt->method != "CV-ADAMS" &&
+          uopt->method != "IDA")
+      {
+        fprintf(stderr, "ERROR: unknown method\n");
+        InputError(argv[0]);
+        return(-1);
+      }
+    }
+    else if (argvi == "--fpaccel")
+      uopt->fpaccel = atoi(argv[++i]);
+    else if (argvi == "--nls")
+    {
+      uopt->nls = string(argv[++i]);
+      /* only the supported nonlinear solver names are accepted */
+      if (uopt->nls != "newton" &&
+          uopt->nls != "tl-newton" &&
+          uopt->nls != "fixedpoint" &&
+          uopt->nls != "none")
+      {
+        fprintf(stderr, "ERROR: unknown nls\n");
+        InputError(argv[0]);
+        return(-1);
+      }
+    }
+    else if (argvi == "--nopre")
+      uopt->precond = 0;
+    else if (argvi == "--fused")
+      uopt->fused = 1;
+    else if (argvi == "--tf")
+      uopt->tf = strtod(argv[++i], NULL);
+    else if (argvi == "--rtol")
+      uopt->rtol = strtod(argv[++i], NULL);
+    else if (argvi == "--atol")
+      uopt->atol = strtod(argv[++i], NULL);
+    else
+    {
+      /* anything else is an unknown option */
+      InputError(argv[0]);
+      return(-1);
+    }
+  }
+
+  /* Explicit method uses no nonlinear solver */
+  if (uopt->method == "ERK")
+    uopt->nls = "none";
+
+  /* CV Adams method only uses fixedpoint nonlinear solver */
+  if (uopt->method == "CV-ADAMS")
+    uopt->nls = "fixedpoint";
+
+  return(0);
+}
+
+
+/* Builds the selector vector for one solution component: mask is zeroed and
+   then set to one at every local grid point for the given component, so that
+   u = y .* umask, v = y .* vmask, w = y .* wmask. */
+int ComponentMask(N_Vector mask, const int component, const UserData* udata)
+{
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* Local grid extents */
+  const int nx   = udata->grid->nxl;
+  const int ny   = udata->grid->nyl;
+  const int nz   = udata->grid->nzl;
+  const int nvar = udata->grid->dof;
+
+  /* 4D device view of the local mask entries */
+  Vec4D selector(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(mask)), nx, ny, nz, nvar);
+
+  /* Clear everything, then switch on the requested component */
+  N_VConst(0.0, mask);
+  Kokkos::parallel_for("Fill_mask",
+                       Range3D({0,0,0},{nx,ny,nz}),
+                       KOKKOS_LAMBDA (int ix, int jy, int kz) {
+    selector(ix,jy,kz,component) = 1.0;
+  });
+
+  return 0;
+}
+
+
+/* Applies defaults and CLI arguments to udata/uopt, then builds the grid and
+   masks, opens the output files, and prints the setup. Returns 0 or -1. */
+int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
+                 SUNContext ctx)
+{
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* MPI variables */
+  udata->comm = MPI_COMM_WORLD;
+  MPI_Comm_rank(udata->comm, &udata->myid);
+  MPI_Comm_size(udata->comm, &udata->nprocs);
+
+  /* Default problem parameters */
+  udata->add_reactions = true;
+  udata->xmax  = 1.0;
+  udata->A     = 1.0;
+  udata->B     = 3.5;
+  udata->k1    = 1.0;
+  udata->k2    = 1.0;
+  udata->k3    = 1.0;
+  udata->k4    = 1.0;
+  udata->k5    = 1.0/5.0e-6;
+  udata->k6    = 1.0/5.0e-6;
+  udata->c     = 0.01;
+  udata->uopt  = uopt;
+  udata->TFID  = NULL;
+  udata->UFID  = NULL;
+  udata->VFID  = NULL;
+  udata->WFID  = NULL;
+  udata->nnlfi = 0;
+
+  /* Set default integrator options */
+  uopt->npxyz[0]  = 0;            /* number of processors in x */
+  uopt->npxyz[1]  = 0;            /* number of processors in y */
+  uopt->npxyz[2]  = 0;            /* number of processors in z */
+  uopt->npts      = 100;          /* number of mesh points in each direction */
+  uopt->order     = 3;            /* method order             */
+  uopt->method    = "ARK-DIRK";   /* stepper/method           */
+  uopt->t0        = 0.0;          /* initial time             */
+  uopt->tf        = 10.0;         /* final time               */
+  uopt->rtol      = 1.0e-6;       /* relative tolerance       */
+  uopt->atol      = 1.0e-9;       /* absolute tolerance       */
+  uopt->nls       = "newton";     /* default to newton, when appropriate */
+  uopt->fpaccel   = 3;            /* default number of fixed point acceleration vectors */
+  uopt->precond   = 1;            /* by default, precondition when appropriate */
+  uopt->fused     = 0;            /* use fused vector ops     */
+  uopt->save      = 1;            /* save solution to disk    */
+  uopt->nout      = 10;           /* number of output times   */
+  uopt->outputdir = (char *) "."; /* output directory         */
+
+  /* Parse CLI args and set udata/uopt appropriately */
+  int retval = ParseArgs(argc, argv, udata, uopt);
+  if (check_retval((void*)&retval, "ParseArgs", 1, udata->myid)) return -1;
+
+  /* Setup the parallel decomposition */
+  const sunindextype npts[] = {uopt->npts, uopt->npts, uopt->npts};
+  const realtype amax[] = {0.0, 0.0, 0.0};
+  const realtype bmax[] = {udata->xmax, udata->xmax, udata->xmax};
+  udata->grid = new ParallelGrid<sunindextype>(&udata->comm, amax, bmax, npts,
+      3, BoundaryType::PERIODIC, StencilType::UPWIND, udata->c, uopt->npxyz);
+
+  /* Create the solution masks */
+  SUNVector *umaskloc = new SUNVector((unsigned int)udata->grid->neq, ctx);
+  udata->umask = N_VMake_MPIPlusX(udata->comm, *umaskloc, ctx);
+  if (check_retval((void *) udata->umask, "N_VMake_MPIPlusX", 0, udata->myid)) MPI_Abort(udata->comm, 1);
+  SUNVector *vmaskloc = new SUNVector((unsigned int)udata->grid->neq, ctx);
+  udata->vmask = N_VMake_MPIPlusX(udata->comm, *vmaskloc, ctx);
+  if (check_retval((void *) udata->vmask, "N_VMake_MPIPlusX", 0, udata->myid)) MPI_Abort(udata->comm, 1);
+  SUNVector *wmaskloc = new SUNVector((unsigned int)udata->grid->neq, ctx);
+  udata->wmask = N_VMake_MPIPlusX(udata->comm, *wmaskloc, ctx);
+  if (check_retval((void *) udata->wmask, "N_VMake_MPIPlusX", 0, udata->myid)) MPI_Abort(udata->comm, 1);
+  ComponentMask(udata->umask, 0, udata);
+  ComponentMask(udata->vmask, 1, udata);
+  ComponentMask(udata->wmask, 2, udata);
+
+  /* Open output files for results */
+  if (uopt->save)
+  {
+    /* t is written by rank 0 only; u, v, w are written by every rank */
+    const char* label[4] = { "t", "u", "v", "w" };
+    FILE** fid[4] = { &udata->TFID, &udata->UFID, &udata->VFID, &udata->WFID };
+    for (int f = (udata->myid == 0) ? 0 : 1; f < 4; f++)
+    {
+      /* snprintf cannot overrun fname; a result >= MXSTR means truncation */
+      char fname[MXSTR];
+      const int len = snprintf(fname, sizeof(fname), "%s/%s.%06d.txt",
+                               uopt->outputdir, label[f], udata->myid);
+      if (len >= 0 && len < MXSTR) *fid[f] = fopen(fname, "w");
+      if (*fid[f] == NULL)
+      {
+        fprintf(stderr, "ERROR: rank %d could not open %s output file in %s\n",
+                udata->myid, label[f], uopt->outputdir);
+        MPI_Abort(udata->comm, 1);
+      }
+    }
+  }
+
+  /* Print problem setup */
+  if (udata->myid == 0)
+  {
+    printf("\n\t\tAdvection-Reaction Test Problem\n\n");
+    printf("Using the MPI+Kokkos NVECTOR");
+#if defined(USE_CUDA)
+    printf(" with the CUDA back-end\n");
+#elif defined(USE_HIP)
+    printf(" with the HIP back-end\n");
+#elif defined(USE_OPENMP)
+    printf(" with the OpenMP back-end and %i threads\n", omp_get_max_threads());
+#else
+    printf(" with the serial back-end\n");
+#endif
+    printf("Number of Processors = %li\n", (long int) udata->nprocs);
+    udata->grid->PrintInfo();
+    printf("Problem Parameters:\n");
+    printf("  A = %g\n", udata->A);
+    printf("  B = %g\n", udata->B);
+    printf("  k = %g\n", udata->k1);
+    printf("  c = %g\n", udata->c);
+    printf("Integrator Options:\n");
+    printf("  order            = %d\n", uopt->order);
+    printf("  method           = %s\n", uopt->method.c_str());
+    printf("  nonlinear solver = %s\n", uopt->nls.c_str());
+    printf("  fpaccel          = %d\n", uopt->fpaccel);
+    printf("  preconditioner   = %d\n", uopt->precond);
+    printf("  fused vector ops = %d\n", uopt->fused);
+    printf("  t0               = %g\n", uopt->t0);
+    printf("  tf               = %g\n", uopt->tf);
+    printf("  reltol           = %.1e\n", uopt->rtol);
+    printf("  abstol           = %.1e\n", uopt->atol);
+    printf("  nout             = %d\n", uopt->nout);
+    printf("Output directory: %s\n", uopt->outputdir);
+  }
+
+  return(0);
+}
+
+
+/* Compute the 3D Gaussian function; x, y and z are overwritten in place. */
+KOKKOS_FUNCTION
+void Gaussian3D(realtype& x, realtype& y, realtype& z, realtype xmax)
+{
+  /* Amplitude, common mean, and common diagonal entry of Sigma */
+  const realtype amp    = 0.1;
+  const realtype center = xmax/RCONST(2.0);
+  const realtype spread = xmax/RCONST(4.0);
+
+  /* scale = 2*sqrt(|Sigma|*(2pi)^3), with |Sigma| = spread^3 */
+  const realtype scale = 2.0 * sqrt((spread*spread*spread)*pow(2*M_PI,3));
+  x = amp * exp( -((x - center)*(x - center)*(1.0/spread)) / scale );
+  y = amp * exp( -((y - center)*(y - center)*(1.0/spread)) / scale );
+  z = amp * exp( -((z - center)*(z - center)*(1.0/spread)) / scale );
+}
+
+
+/* Initial condition function: fills the local part of y with the steady
+   state values of the three components plus a perturbation obtained from
+   Gaussian3D at each local grid point. */
+int SetIC(N_Vector y, UserData* udata)
+{
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* Local extents, mesh spacing, this rank's grid coordinates, domain size */
+  const int      nx     = udata->grid->nxl;
+  const int      ny     = udata->grid->nyl;
+  const int      nz     = udata->grid->nzl;
+  const int      nvar   = udata->grid->dof;
+  const realtype hx     = udata->grid->dx;
+  const realtype hy     = udata->grid->dy;
+  const realtype hz     = udata->grid->dz;
+  const int      px     = udata->grid->coords[0];
+  const int      py     = udata->grid->coords[1];
+  const int      pz     = udata->grid->coords[2];
+  const realtype extent = udata->xmax;
+
+  /* Steady state solution, computed from the problem parameters */
+  const realtype u_ss = udata->k1 * udata->A / udata->k4;
+  const realtype v_ss = udata->k2 * udata->k4 * udata->B /
+                        (udata->k1 * udata->k3 * udata->A);
+  const realtype w_ss = 3.0;
+
+  /* 4D device view of the local portion of y */
+  Vec4D solution(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)),
+                 nx, ny, nz, nvar);
+
+  /* Gaussian perturbation of the steady state solution */
+  Kokkos::parallel_for("SetIC",
+                       Range3D({0,0,0},{nx,ny,nz}),
+                       KOKKOS_LAMBDA (int ix, int jy, int kz)
+  {
+    /* Coordinates of this point from the rank offset and local index;
+       Gaussian3D overwrites them with the perturbation values. */
+    realtype gx = (px * nx + ix) * hx;
+    realtype gy = (py * ny + jy) * hy;
+    realtype gz = (pz * nz + kz) * hz;
+    Gaussian3D(gx,gy,gz,extent);
+    const realtype bump = gx + gy + gz;
+    solution(ix,jy,kz,0) = u_ss + bump;
+    solution(ix,jy,kz,1) = v_ss + bump;
+    solution(ix,jy,kz,2) = w_ss + bump;
+  });
+
+  /* Return success */
+  return(0);
+}
+
+
+/* Write time and solution to disk. Also prints the scaled norm of each
+   component on rank 0. Returns -1 if saving is on but a file is not open. */
+int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* output current solution norm to screen */
+  realtype N = (realtype) udata->grid->npts();
+  realtype u = N_VWL2Norm(y, udata->umask);
+  u = sqrt(u*u/N);
+  realtype v = N_VWL2Norm(y, udata->vmask);
+  v = sqrt(v*v/N);
+  realtype w = N_VWL2Norm(y, udata->wmask);
+  w = sqrt(w*w/N);
+  if (udata->myid == 0) {
+    printf("     %10.6f   %10.6f   %10.6f   %10.6f\n", t, u, v, w);
+    std::fflush(stdout);
+  }
+
+  if (uopt->save)
+  {
+    /* Saving needs all three solution streams; never hand fprintf a null */
+    FILE* out[3] = { udata->UFID, udata->VFID, udata->WFID };
+    if (!out[0] || !out[1] || !out[2]) {
+      fprintf(stderr, "ERROR: solution output files are not open\n");
+      return(-1);
+    }
+
+    /* Copy solution data to host mirror view */
+    SUNVector* ylocal = sundials::kokkos::GetVec<SUNVector>(N_VGetLocalVector_MPIPlusX(y));
+    sundials::kokkos::CopyFromDevice(*ylocal);
+
+    /* output the times to disk */
+    if (udata->myid == 0 && udata->TFID) {
+      fprintf(udata->TFID," %.16e\n", t);
+      std::fflush(udata->TFID);
+    }
+
+    /* create 4D view of host data */
+    const int nxl = udata->grid->nxl;
+    const int nyl = udata->grid->nyl;
+    const int nzl = udata->grid->nzl;
+    const int dof = udata->grid->dof;
+    Vec4DHost yview(N_VGetArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+
+    /* output results to disk: component s of every point goes to out[s] */
+    for (int i = 0; i < nxl; i++)
+      for (int j = 0; j < nyl; j++)
+        for (int k = 0; k < nzl; k++)
+          for (int s = 0; s < 3; s++)
+            fprintf(out[s]," %.16e", yview(i,j,k,s));
+    for (int s = 0; s < 3; s++) { fprintf(out[s],"\n"); std::fflush(out[s]); }
+  }
+
+  return(0);
+}
+
+
+/* Prints the usage message to stderr on rank 0; all ranks then synchronize. */
+void InputError(char *name)
+{
+  int myid;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+
+  if (myid == 0)
+  {
+    fprintf(stderr, "\nERROR: Invalid command line input\n");
+    fprintf(stderr, "\nCommand line options for %s\n",name);
+    fprintf(stderr, "  --help                    prints this message\n");
+    fprintf(stderr, "  --output-dir              the directory where all output files will be written (default is the CWD)\n");
+    fprintf(stderr, "  --nout <int>              number of output times to print (default is 10)\n");
+    fprintf(stderr, "  --dont-save               do not save the solution to the filesystem at the nout interval (default is to save)\n");
+    fprintf(stderr, "  --method                  ERK, ARK-DIRK (default), ARK-IMEX, CV-BDF, CV-ADAMS, IDA\n");
+    fprintf(stderr, "  --fpaccel                 the number of fixed-point acceleration vectors to use (only valid when using fixedpoint nonlinear solver)\n");
+    fprintf(stderr, "  --nls                     nonlinear solver to use (newton, tl-newton (task-local newton), fixedpoint)\n");
+    fprintf(stderr, "  --nopre                   do not precondition the linear system\n");
+    fprintf(stderr, "  --fused                   use fused vector operations\n");
+    fprintf(stderr, "  --order <int>             the method order to use\n");
+    fprintf(stderr, "  --npts <int>              number of mesh points in each direction\n");
+    fprintf(stderr, "  --npxyz <int> <int> <int> number of processors in each direction (0 forces MPI to decide)\n");
+    fprintf(stderr, "  --xmax <realtype>         maximum value of x (size of domain)\n");
+    fprintf(stderr, "  --tf <realtype>           final time\n");
+    fprintf(stderr, "  --A <realtype>            A parameter value\n");
+    fprintf(stderr, "  --B <realtype>            B parameter value\n");
+    fprintf(stderr, "  --k <k1> <k2> <k3> <k4>   reaction rates k1-k4 (four realtype values)\n");
+    fprintf(stderr, "  --c <realtype>            advection speed\n");
+    fprintf(stderr, "  --rtol <realtype>         relative tolerance\n");
+    fprintf(stderr, "  --atol <realtype>         absolute tolerance\n");
+  }
+  MPI_Barrier(MPI_COMM_WORLD);
+}
diff --git a/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.hpp b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.hpp
new file mode 100644
index 0000000000..cb0dceea64
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.hpp
@@ -0,0 +1,171 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                David J. Gardner, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#ifndef ADVECTION_REACTION_3D_HPP
+#define ADVECTION_REACTION_3D_HPP
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <mpi.h>
+
+#include <sundials/sundials_context.h>
+#include <nvector/nvector_mpiplusx.h>
+#include "nvector/nvector_kokkos.hpp"
+#include "check_retval.h"
+#include "ParallelGrid.hpp"
+
+/* Set SUNDIALS Kokkos vector shortcut */
+using SUNVector = sundials::kokkos::Vector<ExecSpace>;
+
+using sundials_tools::ParallelGrid;
+using sundials_tools::BoundaryType;
+using sundials_tools::StencilType;
+using std::string;
+
+/* Maximum size of output directory string */
+constexpr int MXSTR = 2048;
+
+/*
+ * Data structure for problem options
+ */
+
+struct UserOptions
+{
+  int      npxyz[3]; /* number of processors in x,y,z */
+  sunindextype npts; /* mesh points in each direction */
+  realtype t0;       /* initial time                  */
+  realtype tf;       /* final time                    */
+  realtype rtol;     /* relative tolerance            */
+  realtype atol;     /* absolute tolerance            */
+  int      order;    /* method order                  */
+  string   method;   /* method string                 */
+  string   nls;      /* nonlinear solver to use       */
+  int      fpaccel;  /* number of fixedpoint vectors  */
+  int      precond;  /* to precondition or not        */
+  int      fused;    /* use fused vector ops          */
+  int      nout;     /* number of outputs             */
+  int      save;     /* save solution to disk         */
+  char*    outputdir; /* output directory; points into argv or at a literal */
+};
+
+
+/*
+ * Data structure for problem specific data
+ */
+
+struct UserData
+{
+  SUNContext ctx;   /* context passed to the constructor       */
+  SUNProfiler prof; /* profiler obtained from ctx at construction */
+
+  /* MPI data */
+  MPI_Comm    comm;
+  int         myid;
+  int         nprocs;
+  MPI_Request req[2];
+
+  /* Should reactions be added to the advection or not */
+  bool add_reactions;
+
+  /* File handles for output */
+  FILE*  TFID;     /* time output file pointer     */
+  FILE*  UFID;     /* solution output file pointer */
+  FILE*  VFID;     /* solution output file pointer */
+  FILE*  WFID;     /* solution output file pointer */
+
+  /* Solution masks */
+  N_Vector umask;
+  N_Vector vmask;
+  N_Vector wmask;
+
+  /* Problem parameters */
+  realtype  xmax; /* maximum x value              */
+  realtype  A;    /* concentration of species A   */
+  realtype  B;    /* w source rate                */
+  realtype  k1;   /* reaction rates               */
+  realtype  k2;
+  realtype  k3;
+  realtype  k4;
+  realtype  k5;
+  realtype  k6;
+  realtype  c;    /* advection coefficient        */
+
+  /* Parallel mesh */
+  ParallelGrid<sunindextype>* grid;
+
+  /* Count of implicit function evals by the task local nonlinear solver */
+  long int nnlfi;
+
+  /* Integrator options */
+  UserOptions* uopt;
+
+  /* Constructor: stores the context and nulls every pointer the destructor uses */
+  UserData(SUNContext ctx)
+    : ctx(ctx), TFID(nullptr), UFID(nullptr), VFID(nullptr), WFID(nullptr),
+      umask(nullptr), vmask(nullptr), wmask(nullptr), grid(nullptr), uopt(nullptr)
+  {
+    SUNContext_GetProfiler(ctx, &prof);
+  }
+
+  /* destructor frees the problem data */
+  ~UserData();
+};
+
+
+/*
+ * Functions to evolve the solution (defined by the drivers)
+ */
+
+/* function that does ARKStep setup and evolves the solution with a DIRK method */
+extern int EvolveProblemDIRK(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does ARKStep setup and evolves the solution with an IMEX method */
+extern int EvolveProblemIMEX(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does ERKStep setup and evolves the solution */
+extern int EvolveProblemExplicit(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does CVODE BDF setup and evolves the solution */
+extern int EvolveProblemBDF(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does CVODE Adams setup and evolves the solution */
+extern int EvolveProblemAdams(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does IDA BDF setup and evolves the solution */
+extern int EvolveDAEProblem(N_Vector y, UserData* udata, UserOptions* uopt);
+
+
+/*
+ * Helper functions
+ */
+
+/* function to set initial condition */
+int SetIC(N_Vector y, UserData* udata);
+
+/* function to fill neighbor data */
+int FillSendBuffers(N_Vector y, UserData* udata);
+
+/* functions for processing command line args */
+int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
+                 SUNContext ctx);
+void InputError(char *name);
+int ComponentMask(N_Vector mask, const int component, const UserData* udata);
+
+/* function to write solution to disk */
+int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt);
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/arkode_driver.cpp b/benchmarks/advection_reaction_3D/kokkos/arkode_driver.cpp
similarity index 98%
rename from benchmarks/advection_reaction_3D/arkode_driver.cpp
rename to benchmarks/advection_reaction_3D/kokkos/arkode_driver.cpp
index bbea07956a..e2cf1451e3 100644
--- a/benchmarks/advection_reaction_3D/arkode_driver.cpp
+++ b/benchmarks/advection_reaction_3D/kokkos/arkode_driver.cpp
@@ -588,10 +588,7 @@ int TaskLocalLSolve(N_Vector delta, void* arkode_mem)
   SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
 
   /* set up I - gamma*J and solve */
-  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                RAJA::RangeSegment(0, udata->grid->nyl),
-                                RAJA::RangeSegment(0, udata->grid->nzl));
-  retval = SolveReactionLinSys(z, delta, delta, gamma, range, udata);
+  retval = SolveReactionLinSys(z, delta, delta, gamma, udata);
 
 
   return(retval);
diff --git a/benchmarks/advection_reaction_3D/check_retval.h b/benchmarks/advection_reaction_3D/kokkos/check_retval.h
similarity index 99%
rename from benchmarks/advection_reaction_3D/check_retval.h
rename to benchmarks/advection_reaction_3D/kokkos/check_retval.h
index 31a4fa5922..887b7cea5d 100644
--- a/benchmarks/advection_reaction_3D/check_retval.h
+++ b/benchmarks/advection_reaction_3D/kokkos/check_retval.h
@@ -54,4 +54,4 @@ static int check_retval(void *returnvalue, const char *funcname, int opt, int my
   return(0);
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/benchmarks/advection_reaction_3D/cvode_driver.cpp b/benchmarks/advection_reaction_3D/kokkos/cvode_driver.cpp
similarity index 100%
rename from benchmarks/advection_reaction_3D/cvode_driver.cpp
rename to benchmarks/advection_reaction_3D/kokkos/cvode_driver.cpp
diff --git a/benchmarks/advection_reaction_3D/ida_driver.cpp b/benchmarks/advection_reaction_3D/kokkos/ida_driver.cpp
similarity index 100%
rename from benchmarks/advection_reaction_3D/ida_driver.cpp
rename to benchmarks/advection_reaction_3D/kokkos/ida_driver.cpp
diff --git a/benchmarks/advection_reaction_3D/kokkos/rhs3D.hpp b/benchmarks/advection_reaction_3D/kokkos/rhs3D.hpp
new file mode 100644
index 0000000000..34698146ab
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/rhs3D.hpp
@@ -0,0 +1,540 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                David J. Gardner, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------*/
+
+#ifndef ADVECTION_REACTION_3D_RHS_HPP
+#define ADVECTION_REACTION_3D_RHS_HPP
+
+#include "advection_reaction_3D.hpp"
+
+/* --------------------------------------------------------------
+ * Right hand side (RHS) and residual functions
+ * --------------------------------------------------------------*/
+
+/* Compute the advection term f(t,y) = -c (grad * y). This is done using
+   first-order upwind finite differences.  At present, only periodic boundary
+   conditions are supported, which are handled via MPI's Cartesian
+   communicator (even for serial runs). */
+static int Advection(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  /* access problem data */
+  UserData* udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set variable shortcuts; cx, cy, cz = -c/dx, -c/dy, -c/dz are the stencil weights */
+  const int      nxl = udata->grid->nxl;
+  const int      nyl = udata->grid->nyl;
+  const int      nzl = udata->grid->nzl;
+  const int      dof = udata->grid->dof;
+  const realtype c   = udata->c;
+  const realtype cx  = -c / udata->grid->dx;
+  const realtype cy  = -c / udata->grid->dy;
+  const realtype cz  = -c / udata->grid->dz;
+
+  /* local variables */
+  int retval;
+
+  /* fill send buffers and begin exchanging boundary information (return -1 on failure) */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = FillSendBuffers(y, udata);
+  if (check_retval(&retval, "FillSendBuffers", 1, udata->myid))
+    return(-1);
+  retval = udata->grid->ExchangeStart();
+  if (check_retval(&retval, "ExchangeStart", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+  /* set output to zero; this is the final result when c == 0 since no kernel runs then */
+  N_VConst(0.0, ydot);
+
+  /* create 4D views of the state and RHS vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D dYview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(ydot)), nxl, nyl, nzl, dof);
+
+  /* iterate over domain interior (cells that need no received data), computing advection */
+  if (c > 0.0)
+  {
+    /* flow moving in the positive x,y,z direction: backward differences for i,j,k >= 1 */
+    Kokkos::parallel_for("AdvectionInteriorRight",
+                         Range3D({1,1,1},{nxl,nyl,nzl}),
+                         KOKKOS_LAMBDA (int i, int j, int k)
+    {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0)  = cz * (u_ijk - Yview(i,j,k-1,0)); // du/dz
+      dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j-1,k,0)); // du/dy
+      dYview(i,j,k,0) += cx * (u_ijk - Yview(i-1,j,k,0)); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1)  = cz * (v_ijk - Yview(i,j,k-1,1)); // dv/dz
+      dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j-1,k,1)); // dv/dy
+      dYview(i,j,k,1) += cx * (v_ijk - Yview(i-1,j,k,1)); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2)  = cz * (w_ijk - Yview(i,j,k-1,2)); // dw/dz
+      dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j-1,k,2)); // dw/dy
+      dYview(i,j,k,2) += cx * (w_ijk - Yview(i-1,j,k,2)); // dw/dx
+    });
+  }
+  else if (c < 0.0)
+  {
+    /* flow moving in the negative x,y,z direction: forward differences, last cell per direction skipped */
+    Kokkos::parallel_for("AdvectionInteriorLeft",
+                         Range3D({0,0,0},{nxl-1,nyl-1,nzl-1}),
+                         KOKKOS_LAMBDA (int i, int j, int k)
+    {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0)  = cz * (Yview(i,j,k+1,0) - u_ijk); // du/dz
+      dYview(i,j,k,0) += cy * (Yview(i,j+1,k,0) - u_ijk); // du/dy
+      dYview(i,j,k,0) += cx * (Yview(i+1,j,k,0) - u_ijk); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1)  = cz * (Yview(i,j,k+1,1) - v_ijk); // dv/dz
+      dYview(i,j,k,1) += cy * (Yview(i,j+1,k,1) - v_ijk); // dv/dy
+      dYview(i,j,k,1) += cx * (Yview(i+1,j,k,1) - v_ijk); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2)  = cz * (Yview(i,j,k+1,2) - w_ijk); // dw/dz
+      dYview(i,j,k,2) += cy * (Yview(i,j+1,k,2) - w_ijk); // dw/dy
+      dYview(i,j,k,2) += cx * (Yview(i+1,j,k,2) - w_ijk); // dw/dx
+    });
+  }
+
+  /* finish exchanging boundary information (return -1 on failure) */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = udata->grid->ExchangeEnd();
+  if (check_retval(&retval, "ExchangeEnd", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+  /* compute advection on the faces skipped above, using the received neighbor data */
+  if (c > 0.0)
+  {
+    /* Flow moving in the positive x,y,z direction:
+       boundaries are west face, south face, and back face */
+
+    /*   Create 4D views of receive buffers (extent 1 in the face-normal direction) */
+    Vec4D Wrecv(udata->grid->GetRecvView("WEST"),  1, nyl, nzl, dof);
+    Vec4D Srecv(udata->grid->GetRecvView("SOUTH"), nxl, 1, nzl, dof);
+    Vec4D Brecv(udata->grid->GetRecvView("BACK"),  nxl, nyl, 1, dof);
+
+    /*   Perform calculations on each "lower" face (a cell on several faces is computed identically by each kernel) */
+    Kokkos::parallel_for("AdvectionBoundaryWest",
+                         Range3D({0,0,0},{nyl,nzl,dof}),
+                         KOKKOS_LAMBDA (int j, int k, int l)
+    {
+      const int i = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,0,k,l);
+      const realtype YBack  = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,0,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - Wrecv(0,j,k,l)); // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth);         // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack);          // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundarySouth",
+                         Range3D({0,0,0},{nxl,nzl,dof}),
+                         KOKKOS_LAMBDA (int i, int k, int l)
+    {
+      const int j = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YWest = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(0,j,k,l);
+      const realtype YBack = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,0,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - YWest);          // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - Srecv(i,0,k,l)); // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack);          // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundaryBack",
+                         Range3D({0,0,0},{nxl,nyl,dof}),
+                         KOKKOS_LAMBDA (int i, int j, int l)
+    {
+      const int k = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YWest  = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(0,j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,0,k,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - YWest);          // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth);         // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - Brecv(i,j,0,l)); // d/dz
+    });
+
+  }
+  else if (c < 0.0)
+  {
+
+    /* Flow moving in the negative x,y,z direction:
+       boundaries are east face, north face, and front face */
+
+    /*   Create 4D views of receive buffers (extent 1 in the face-normal direction) */
+    Vec4D Erecv(udata->grid->GetRecvView("EAST"),  1, nyl, nzl, dof);
+    Vec4D Nrecv(udata->grid->GetRecvView("NORTH"), nxl, 1, nzl, dof);
+    Vec4D Frecv(udata->grid->GetRecvView("FRONT"), nxl, nyl, 1, dof);
+
+    /*   Perform calculations on each "upper" face (a cell on several faces is computed identically by each kernel) */
+    Kokkos::parallel_for("AdvectionBoundaryEast",
+                         Range3D({0,0,0},{nyl,nzl,dof}),
+                         KOKKOS_LAMBDA (int j, int k, int l)
+    {
+      const int i = nxl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,0,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,0,l);
+      dYview(i,j,k,l)  = cx * (Erecv(0,j,k,l) - Yijkl); // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl);         // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl);         // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundaryNorth",
+                         Range3D({0,0,0},{nxl,nzl,dof}),
+                         KOKKOS_LAMBDA (int i, int k, int l)
+    {
+      const int j = nyl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast  = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(0,j,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,0,l);
+      dYview(i,j,k,l)  = cx * (YEast - Yijkl);          // d/dx
+      dYview(i,j,k,l) += cy * (Nrecv(i,0,k,l) - Yijkl); // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl);         // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundaryFront",
+                         Range3D({0,0,0},{nxl,nyl,dof}),
+                         KOKKOS_LAMBDA (int i, int j, int l)
+    {
+      const int k = nzl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast  = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(0,j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,0,k,l);
+      dYview(i,j,k,l)  = cx * (YEast - Yijkl);          // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl);         // d/dy
+      dYview(i,j,k,l) += cz * (Frecv(i,j,0,l) - Yijkl); // d/dz
+    });
+  }
+
+  /* return success */
+  return(0);
+}
+
+
+/* Reaction term g(t,y), accumulated into ydot cell by cell. */
+static int Reaction(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* local grid extents */
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const int dof = udata->grid->dof;
+
+  /* reaction parameters, copied to locals for use inside the kernel */
+  const realtype A  = udata->A;
+  const realtype B  = udata->B;
+  const realtype k1 = udata->k1;
+  const realtype k2 = udata->k2;
+  const realtype k3 = udata->k3;
+  const realtype k4 = udata->k4;
+  const realtype k5 = udata->k5;
+  const realtype k6 = udata->k6;
+
+  /* start from zero unless the reactions are being added to an existing RHS */
+  if (!udata->add_reactions) { N_VConst(0.0, ydot); }
+
+  /* wrap the local state and RHS data in 4D views */
+  Vec4D state(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D rhs(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(ydot)), nxl, nyl, nzl, dof);
+
+  /* accumulate the reaction terms into the RHS */
+  Kokkos::parallel_for("ReactionRHS",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+    const realtype u = state(i,j,k,0);
+    const realtype v = state(i,j,k,1);
+    const realtype w = state(i,j,k,2);
+
+    rhs(i,j,k,0) += k1 * A - k2 * w * u + k3 * u * u * v - k4 * u; /* u equation */
+    rhs(i,j,k,1) += k2 * w * u - k3 * u * u * v;                   /* v equation */
+    rhs(i,j,k,2) += -k2 * w * u + k5 * B - k6 * w;                 /* w equation */
+  });
+
+  return 0;
+}
+
+
+/* Full RHS h(t,y) = f(t,y) + g(t,y): advection followed by reaction. */
+static int AdvectionReaction(realtype t, N_Vector y, N_Vector ydot,
+                             void *user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+  /* Advection has to run before Reaction (the call order is critical). */
+  int flag = Advection(t, y, ydot, user_data);
+  if (check_retval(static_cast<void*>(&flag), "Advection", 1, udata->myid))
+  {
+    return -1;
+  }
+
+  flag = Reaction(t, y, ydot, user_data);
+  if (check_retval(static_cast<void*>(&flag), "Reaction", 1, udata->myid))
+  {
+    return -1;
+  }
+  return 0;
+}
+
+/* Residual F(t,y,y') = ydot - h(t,y), with h the advection-reaction RHS. */
+static int AdvectionReactionResidual(realtype t, N_Vector y, N_Vector ydot,
+                                     N_Vector F, void *user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+
+  /* Build h(t,y) in F; Advection has to run before Reaction. */
+  int flag = Advection(t, y, F, user_data);   /* F = -c y_x */
+  if (check_retval(static_cast<void*>(&flag), "Advection", 1, udata->myid))
+  {
+    return -1;
+  }
+  flag = Reaction(t, y, F, user_data);        /* F = -c y_x + g(t,y) */
+  if (check_retval(static_cast<void*>(&flag), "Reaction", 1, udata->myid))
+  {
+    return -1;
+  }
+
+  /* F <- ydot - F = ydot + c y_x - g(t,y) */
+  N_VLinearSum(1.0, ydot, -1.0, F, F);
+  return 0;
+}
+
+/* --------------------------------------------------------------
+ * Linear system and Jacobian functions
+ * --------------------------------------------------------------*/
+
+/* Solve the linear systems Ax = b where A = I - gamma*dg/dy, one 3x3 system
+   per grid cell with dg/dy evaluated at y. When using a fully implicit method,
+   we are approximating dh/dy as dg/dy. x may be the same vector as b: each
+   cell's entries of b are read before any entry of x is written. Returns 0. */
+static int SolveReactionLinSys(N_Vector y, N_Vector x, N_Vector b,
+                               const realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2  = udata->k2;
+  const realtype k3  = udata->k3;
+  const realtype k4  = udata->k4;
+  const realtype k6  = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D Bview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(b)), nxl, nyl, nzl, dof);
+  Vec4D Xview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(x)), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system */
+  Kokkos::parallel_for("SolveReactionLinSys",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    /* read b before writing x: the two vectors may alias (see TaskLocalLSolve) */
+    const realtype b0 = Bview(i,j,k,0);
+    const realtype b1 = Bview(i,j,k,1);
+    const realtype b2 = Bview(i,j,k,2);
+
+    // compute A = I - gamma*(dg/dy)
+
+    /* 1st row: u, v, w */
+    const realtype A0 = 1. - gamma * (-k2 * w + 2.0 * k3 * u * v - k4);
+    const realtype A1 = -gamma * (k3 * u * u);
+    const realtype A2 = -gamma * (-k2 * u);
+
+    /* 2nd row: u, v, w */
+    const realtype A3 = -gamma * (k2 * w - 2.0 * k3 * u * v);
+    const realtype A4 = 1. - gamma * (-k3 * u * u);
+    const realtype A5 = -gamma * (k2 * u);
+
+    /* 3rd row: u, v, w */
+    const realtype A6 = -gamma * (-k2 * w);
+    const realtype A7 =  0.0;
+    const realtype A8 = 1. - gamma * (-k2 * u - k6);
+
+    // compute x = A^{-1}*b
+
+    const realtype scratch_0 = A4*A8;
+    const realtype scratch_1 = A1*A5;
+    const realtype scratch_2 = A2*A7;
+    const realtype scratch_3 = A5*A7;
+    const realtype scratch_4 = A1*A8;
+    const realtype scratch_5 = A2*A4;
+    const realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
+    const realtype scratch_7 = A2*A3;
+    const realtype scratch_8 = A6*b0;
+    const realtype scratch_9 = A2*A6;
+    const realtype scratch_10 = A3*b0;
+    const realtype scratch_11 = 1.0/A0;
+    const realtype scratch_12 = A1*scratch_11;
+    const realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
+
+    Xview(i,j,k,0) = scratch_6*( b0*(scratch_0 - scratch_3)
+                               + b1*(scratch_2 - scratch_4)
+                               + b2*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( b2*(scratch_7 - A0*A5)
+                               + b1*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    Xview(i,j,k,2) = ( -b2 + scratch_11*scratch_8
+                     + scratch_13*(b1 - scratch_10*scratch_11)) /
+                     (-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
+  });
+
+  return(0);
+}
+
+/* Solve the linear systems Ax = b where A = gamma*I - dg/dy, one 3x3 system
+   per grid cell with dg/dy evaluated at y. We are approximating dh/dy as dg/dy.
+   x may be the same vector as b: each cell's entries of b are read before any
+   entry of x is written. Returns 0. */
+static int SolveReactionLinSysRes(N_Vector y, N_Vector x, N_Vector b,
+                                  const realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2  = udata->k2;
+  const realtype k3  = udata->k3;
+  const realtype k4  = udata->k4;
+  const realtype k6  = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D Bview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(b)), nxl, nyl, nzl, dof);
+  Vec4D Xview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(x)), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system (label distinct from SolveReactionLinSys for profiling) */
+  Kokkos::parallel_for("SolveReactionLinSysRes",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    /* read b before writing x, since the two vectors may alias */
+    const realtype b0 = Bview(i,j,k,0);
+    const realtype b1 = Bview(i,j,k,1);
+    const realtype b2 = Bview(i,j,k,2);
+
+    // compute A = gamma*I - dg/dy; gamma multiplies the identity because the
+    // residual is F = ydot - h(t,y), i.e. dF/dydot = I
+
+    /* 1st row: u, v, w */
+    const realtype A0 = -(-k2 * w + 2.0 * k3 * u * v - k4) + gamma;
+    const realtype A1 = -(k3 * u * u);
+    const realtype A2 = -(-k2 * u);
+
+    /* 2nd row: u, v, w */
+    const realtype A3 = -(k2 * w - 2.0 * k3 * u * v);
+    const realtype A4 = -(-k3 * u * u) + gamma;
+    const realtype A5 = -(k2 * u);
+
+    /* 3rd row: u, v, w */
+    const realtype A6 = -(-k2 * w);
+    const realtype A7 =  0.0;
+    const realtype A8 = -(-k2 * u - k6) + gamma;
+
+    // compute x = A^{-1}*b
+
+    const realtype scratch_0 = A4*A8;
+    const realtype scratch_1 = A1*A5;
+    const realtype scratch_2 = A2*A7;
+    const realtype scratch_3 = A5*A7;
+    const realtype scratch_4 = A1*A8;
+    const realtype scratch_5 = A2*A4;
+    const realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
+    const realtype scratch_7 = A2*A3;
+    const realtype scratch_8 = A6*b0;
+    const realtype scratch_9 = A2*A6;
+    const realtype scratch_10 = A3*b0;
+    const realtype scratch_11 = 1.0/A0;
+    const realtype scratch_12 = A1*scratch_11;
+    const realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
+
+    Xview(i,j,k,0) = scratch_6*( b0*(scratch_0 - scratch_3)
+                               + b1*(scratch_2 - scratch_4)
+                               + b2*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( b2*(scratch_7 - A0*A5)
+                               + b1*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    Xview(i,j,k,2) = ( -b2 + scratch_11*scratch_8
+                     + scratch_13*(b1 - scratch_10*scratch_11)) /
+                     (-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
+  });
+
+  return(0);
+}
+
+
+/* --------------------------------------------------------------
+ * Preconditioner functions
+ * --------------------------------------------------------------*/
+
+/* Preconditioner solve: computes z from P z = r with P = I - gamma * dg/dy.
+   t, ydot, delta and lr are not used. */
+static int PSolve(realtype t, N_Vector y, N_Vector ydot, N_Vector r,
+                  N_Vector z, realtype gamma, realtype delta, int lr,
+                  void *user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* the task-local reaction solve does all of the work; its return value
+     is passed straight back to the caller */
+  const int status = SolveReactionLinSys(y, z, r, gamma, udata);
+
+  return status;
+}
+
+/* Residual-form preconditioner solve: computes z from P z = r with
+   P = -dg/dy + cj. t, ydot, F and delta are not used. */
+static int PSolveRes(realtype t, N_Vector y, N_Vector ydot, N_Vector F,
+                     N_Vector r, N_Vector z, realtype cj, realtype delta,
+                     void *user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* the task-local reaction solve does all of the work, with cj passed as
+     its gamma; its return value is passed straight back to the caller */
+  const int status = SolveReactionLinSysRes(y, z, r, cj, udata);
+
+  return status;
+}
+
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/raja/CMakeLists.txt b/benchmarks/advection_reaction_3D/raja/CMakeLists.txt
new file mode 100644
index 0000000000..0bae78c562
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/CMakeLists.txt
@@ -0,0 +1,151 @@
+# ---------------------------------------------------------------
+# Programmer(s):  Cody J. Balos @ LLNL
+#                 Daniel R. Reynolds @ SMU
+# ---------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2023, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ---------------------------------------------------------------
+
+if(BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA)  # every target below links all three packages
+
+  if((RAJA_BACKENDS MATCHES "TARGET_OPENMP") OR (RAJA_BACKENDS MATCHES "OPENMP"))
+    set(OTHER_LIBS OpenMP::OpenMP_CXX)  # extra link dependency when RAJA has an OpenMP backend
+  endif()
+
+  # ----------------------------------------------------------------------------
+  # MPI only (links the serial and MPIPlusX NVECTORs)
+  # ----------------------------------------------------------------------------
+
+  add_executable(advection_reaction_3D_raja
+    advection_reaction_3D.cpp
+    arkode_driver.cpp
+    cvode_driver.cpp
+    ida_driver.cpp
+    rhs3D.hpp
+    ParallelGrid.hpp
+    check_retval.h
+    backends.hpp)
+
+  # ensure the linker language is reset to CXX
+  set_target_properties(advection_reaction_3D_raja PROPERTIES LINKER_LANGUAGE CXX)
+
+  target_include_directories(advection_reaction_3D_raja
+    PRIVATE
+    ${PROJECT_SOURCE_DIR}/utilities
+    ${MPI_CXX_INCLUDE_DIRS})
+
+  target_link_libraries(advection_reaction_3D_raja
+    PRIVATE
+    sundials_arkode
+    sundials_cvode
+    sundials_ida
+    sundials_nvecmpiplusx
+    sundials_nvecserial
+    RAJA
+    ${MPI_CXX_LIBRARIES}
+    ${OTHER_LIBS})
+
+  install(TARGETS advection_reaction_3D_raja
+    DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja")
+
+  install(FILES README.md ../scripts/compare_error.py ../scripts/compute_error.py ../scripts/pickle_solution_output.py
+    DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja")
+
+  # ----------------------------------------------------------------------------
+  # MPI + CUDA (sources marked as CUDA, USE_CUDA_NVEC defined)
+  # ----------------------------------------------------------------------------
+
+  if(BUILD_NVECTOR_CUDA)
+
+    set_source_files_properties(advection_reaction_3D.cpp
+      PROPERTIES LANGUAGE CUDA)  # NOTE(review): source properties are per-directory, so this presumably also affects the MPI-only target above -- confirm intended
+    set_source_files_properties(arkode_driver.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(cvode_driver.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(ida_driver.cpp PROPERTIES LANGUAGE CUDA)
+
+    add_executable(advection_reaction_3D_raja_mpicuda
+      advection_reaction_3D.cpp
+      arkode_driver.cpp
+      cvode_driver.cpp
+      ida_driver.cpp
+      rhs3D.hpp
+      ParallelGrid.hpp
+      check_retval.h
+      backends.hpp)
+
+    # ensure the linker language is reset to CXX
+    set_target_properties(advection_reaction_3D_raja_mpicuda
+      PROPERTIES LINKER_LANGUAGE CXX)
+
+    target_include_directories(advection_reaction_3D_raja_mpicuda
+      PRIVATE
+      ${PROJECT_SOURCE_DIR}/utilities
+      ${MPI_CXX_INCLUDE_DIRS})
+
+    target_link_libraries(advection_reaction_3D_raja_mpicuda
+      PRIVATE
+      sundials_arkode
+      sundials_cvode
+      sundials_ida
+      sundials_nvecmpiplusx
+      sundials_nveccuda
+      RAJA
+      ${MPI_CXX_LIBRARIES}
+      ${OTHER_LIBS})
+
+    target_compile_definitions(advection_reaction_3D_raja_mpicuda PRIVATE USE_CUDA_NVEC)
+
+    install(TARGETS advection_reaction_3D_raja_mpicuda
+      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja")
+
+  endif()  # BUILD_NVECTOR_CUDA
+
+  # ----------------------------------------------------------------------------
+  # MPI + HIP (links hip::device, USE_HIP_NVEC defined)
+  # ----------------------------------------------------------------------------
+
+  if(BUILD_NVECTOR_HIP)
+
+    add_executable(advection_reaction_3D_raja_mpihip
+      advection_reaction_3D.cpp
+      advection_reaction_3D.hpp
+      arkode_driver.cpp
+      cvode_driver.cpp
+      ida_driver.cpp
+      rhs3D.hpp
+      ParallelGrid.hpp
+      check_retval.h
+      backends.hpp)
+
+    target_include_directories(advection_reaction_3D_raja_mpihip
+      PRIVATE
+      ${PROJECT_SOURCE_DIR}/utilities
+      ${MPI_CXX_INCLUDE_DIRS})
+
+    target_link_libraries(advection_reaction_3D_raja_mpihip
+      PRIVATE
+      sundials_arkode
+      sundials_cvode
+      sundials_ida
+      sundials_nvecmpiplusx
+      sundials_nvechip
+      RAJA
+      hip::device
+      ${MPI_CXX_LIBRARIES}
+      ${OTHER_LIBS})
+
+    target_compile_definitions(advection_reaction_3D_raja_mpihip PRIVATE USE_HIP_NVEC)
+
+    install(TARGETS advection_reaction_3D_raja_mpihip
+      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja")
+
+  endif()  # BUILD_NVECTOR_HIP
+
+endif()  # BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA
diff --git a/benchmarks/advection_reaction_3D/ParallelGrid.hpp b/benchmarks/advection_reaction_3D/raja/ParallelGrid.hpp
similarity index 56%
rename from benchmarks/advection_reaction_3D/ParallelGrid.hpp
rename to benchmarks/advection_reaction_3D/raja/ParallelGrid.hpp
index abd6185810..1592a27806 100644
--- a/benchmarks/advection_reaction_3D/ParallelGrid.hpp
+++ b/benchmarks/advection_reaction_3D/raja/ParallelGrid.hpp
@@ -1,5 +1,6 @@
 /* -----------------------------------------------------------------------------
  * Programmer(s): Cody J. Balos @ LLNL
+ *                Daniel R. Reynolds @ SMU
  * -----------------------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2023, Lawrence Livermore National Security
@@ -40,24 +41,26 @@ enum class StencilType
   UPWIND
 };
 
-template<typename REAL, typename GLOBALINT, int NDIMS>
+template<typename REAL, typename GLOBALINT>
 class ParallelGrid
 {
 public:
   // Constructor that creates a new ParallelGrid object.
   // [in] - the memory helper to use for allocating the MPI buffers
   // [in,out] comm - on input, the overal MPI communicator, on output, the cartesian communicator
-  // [in] a[] - an array of length NDIMS which defines the domain [a,b]
-  // [in] b[] - an array of length NDIMS which defines the domain [a,b]
-  // [in] npts[] - an array of length NDIMS which defines the number of mesh points in each dimension
+  // [in] a[] - an array of length 3 which defines the domain [a,b]
+  // [in] b[] - an array of length 3 which defines the domain [a,b]
+  // [in] npts[] - an array of length 3 which defines the number of mesh points in each dimension
   // [in] dof - the number of degrees of freedom in each dimension
   // [in] bc - the type of boundary conditions (see BoundaryType)
   // [in] st - the stencil to use (see StencilType)
   // [in] width - the stencil width; defaults to 1
   // [in] npxyz - the number of processors in each dimension; defaults to 0 which means MPI will choose
   // [in] reorder - should MPI_Cart_create do process reordering to optimize or not; defaults to false (some MPI implementations ignore this)
-  ParallelGrid(SUNMemoryHelper memhelp, MPI_Comm* comm, const REAL a[], const REAL b[], const GLOBALINT npts[], int dof,
-               BoundaryType bc, StencilType st, int width = 1, const int npxyz[] = nullptr, bool reorder = false)
+  ParallelGrid(SUNMemoryHelper memhelp, MPI_Comm* comm, const REAL a[], const REAL b[],
+               const GLOBALINT npts[], int dof, BoundaryType bc, StencilType st,
+               const REAL c, int width = 1, const int npxyz[] = nullptr,
+               bool reorder = false)
     : nx(1), ny(1), nz(1),
       nxl(1), nyl(1), nzl(1),
       npx(1), npy(1), npz(1),
@@ -66,35 +69,40 @@ class ParallelGrid
       bx(0.0), by(0.0), bz(0.0),
       dof(dof), dims{0,0,0}, coords{0,0,0},
       bc(bc), st(st), width(width),
+      upwindRight(true),
       memhelp(memhelp)
-  {
-    static_assert((NDIMS >= 1 && NDIMS <= 3), "ParallelGrid NDIMS must be 1, 2 or 3");
 
-    int retval, nprocs;
-    int periods[] = {0, 0, 0};
+  {
+    assert(st == StencilType::UPWIND);
 
+    /* Set up MPI Cartesian communicator */
     if (npxyz)
     {
       dims[0] = npxyz[0];
-      if (NDIMS >= 2) dims[1] = npxyz[1];
-      if (NDIMS == 3) dims[2] = npxyz[2];
+      dims[1] = npxyz[1];
+      dims[2] = npxyz[2];
     }
 
+    int retval, nprocs;
     MPI_Comm_size(*comm, &nprocs);
-    retval = MPI_Dims_create(nprocs, NDIMS, dims);
+    retval = MPI_Dims_create(nprocs, 3, dims);
     assert(retval == MPI_SUCCESS);
 
-    periods[0] = bc == BoundaryType::PERIODIC;
-    periods[1] = bc == BoundaryType::PERIODIC;
-    periods[2] = bc == BoundaryType::PERIODIC;
-    retval = MPI_Cart_create(*comm, NDIMS, dims, periods, reorder, comm);
+    int periods[] = { bc == BoundaryType::PERIODIC,
+                      bc == BoundaryType::PERIODIC,
+                      bc == BoundaryType::PERIODIC };
+    retval = MPI_Cart_create(*comm, 3, dims, periods, reorder, comm);
     assert(retval == MPI_SUCCESS);
 
-    retval = MPI_Cart_get(*comm, NDIMS, dims, periods, coords);
+    retval = MPI_Cart_get(*comm, 3, dims, periods, coords);
     assert(retval == MPI_SUCCESS);
 
     cart_comm = *comm;
 
+    /* Set upwinding direction */
+    upwindRight = (c > 0.0);
+
+    /* Set up information for the first spatial dimension */
     npx = dims[0];
     nx  = npts[0];
     ax  = a[0];
@@ -103,251 +111,235 @@ class ParallelGrid
     int is = nx*(coords[0])/npx;
     int ie = nx*(coords[0]+1)/npx-1;
     nxl = ie-is+1;
-
     neq = dof * nxl;
 
-    if (NDIMS >= 2)
-    {
-      npy = dims[1];
-      ny  = npts[1];
-      ay  = a[1];
-      by  = b[1];
-      dy  = (by-ay) / (REAL) ny;
-      int js = ny*(coords[1])/npy;
-      int je = ny*(coords[1]+1)/npy-1;
-      nyl = je-js+1;
-
-      neq *= nyl;
-    }
-
-    if (NDIMS == 3)
-    {
-      npz = dims[2];
-      nz  = npts[2];
-      az  = a[2];
-      bz  = b[2];
-      dz  = (bz-az) / (REAL) nz;
-      int ks = nz*(coords[2])/npz;
-      int ke = nz*(coords[2]+1)/npz-1;
-      nzl = ke-ks+1;
-
-      neq *= nzl;
-    }
-
+    /* Set up information for the second spatial dimension */
+    npy = dims[1];
+    ny  = npts[1];
+    ay  = a[1];
+    by  = b[1];
+    dy  = (by-ay) / (REAL) ny;
+    int js = ny*(coords[1])/npy;
+    int je = ny*(coords[1]+1)/npy-1;
+    nyl = je-js+1;
+    neq *= nyl;
+
+    /* Set up information for the third spatial dimension */
+    npz = dims[2];
+    nz  = npts[2];
+    az  = a[2];
+    bz  = b[2];
+    dz  = (bz-az) / (REAL) nz;
+    int ks = nz*(coords[2])/npz;
+    int ke = nz*(coords[2]+1)/npz-1;
+    nzl = ke-ks+1;
+    neq *= nzl;
+
+    /* Allocate buffers for nearest-neighbor exchange */
     if (st == StencilType::UPWIND)
       AllocateBuffersUpwind();
 
   }
 
   // TODO:
-  //  - does not take advantage of upwind scheme to reduce communications and memory
   //  - support non-periodic boundary conditions
   // For all faces where neighbors exist: determine neighbor process indices.
   // For all faces: allocate exchange buffers.
   void AllocateBuffersUpwind()
   {
-    int retval = 0;
-    int nbcoords[] = {0, 0, 0};
 
-    SUNMemoryHelper_Alloc(memhelp, &Wrecv_, sizeof(REAL)*dof*width*nyl*nzl,
-                          memoryType(), nullptr);
-    SUNMemoryHelper_Alloc(memhelp, &Wsend_, sizeof(REAL)*dof*width*nyl*nzl,
-                          memoryType(), nullptr);
+    /* Allocate send/receive buffers and determine ID for communication West */
+    if (upwindRight)
+      SUNMemoryHelper_Alloc(memhelp, &Wrecv_, sizeof(REAL)*dof*width*nyl*nzl,
+                            memoryType(), nullptr);
+    else
+      SUNMemoryHelper_Alloc(memhelp, &Wsend_, sizeof(REAL)*dof*width*nyl*nzl,
+                            memoryType(), nullptr);
     ipW = MPI_PROC_NULL;
     if ((coords[0] > 0) || (bc == BoundaryType::PERIODIC)) {
-      nbcoords[0] = coords[0]-1;
-      nbcoords[1] = coords[1];
-      nbcoords[2] = coords[2];
-      retval = MPI_Cart_rank(cart_comm, nbcoords, &ipW);
+      int nbcoords[] = {coords[0]-1, coords[1], coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipW);
       assert(retval == MPI_SUCCESS);
     }
 
-    SUNMemoryHelper_Alloc(memhelp, &Erecv_, sizeof(REAL)*dof*width*nyl*nzl,
-                          memoryType(), nullptr);
-    SUNMemoryHelper_Alloc(memhelp, &Esend_, sizeof(REAL)*dof*width*nyl*nzl,
-                          memoryType(), nullptr);
+    /* Allocate send/receive buffers and determine ID for communication East */
+    if (upwindRight)
+      SUNMemoryHelper_Alloc(memhelp, &Esend_, sizeof(REAL)*dof*width*nyl*nzl,
+                            memoryType(), nullptr);
+    else
+      SUNMemoryHelper_Alloc(memhelp, &Erecv_, sizeof(REAL)*dof*width*nyl*nzl,
+                            memoryType(), nullptr);
     ipE = MPI_PROC_NULL;
     if ((coords[0] < dims[0]-1) || (bc == BoundaryType::PERIODIC)) {
-      nbcoords[0] = coords[0]+1;
-      nbcoords[1] = coords[1];
-      nbcoords[2] = coords[2];
-      retval = MPI_Cart_rank(cart_comm, nbcoords, &ipE);
+      int nbcoords[] = {coords[0]+1, coords[1], coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipE);
       assert(retval == MPI_SUCCESS);
     }
 
-    if (NDIMS >= 2)
-    {
+    /* Allocate send/receive buffers and determine ID for communication South */
+    if (upwindRight)
       SUNMemoryHelper_Alloc(memhelp, &Srecv_, sizeof(REAL)*dof*width*nxl*nzl,
                             memoryType(), nullptr);
+    else
       SUNMemoryHelper_Alloc(memhelp, &Ssend_, sizeof(REAL)*dof*width*nxl*nzl,
                             memoryType(), nullptr);
-      ipS = MPI_PROC_NULL;
-      if ((coords[1] > 0) || (bc == BoundaryType::PERIODIC)) {
-        nbcoords[0] = coords[0];
-        nbcoords[1] = coords[1]-1;
-        nbcoords[2] = coords[2];
-        retval = MPI_Cart_rank(cart_comm, nbcoords, &ipS);
-        assert(retval == MPI_SUCCESS);
-      }
+    ipS = MPI_PROC_NULL;
+    if ((coords[1] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1]-1, coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipS);
+      assert(retval == MPI_SUCCESS);
+    }
 
-      SUNMemoryHelper_Alloc(memhelp, &Nrecv_, sizeof(REAL)*dof*width*nxl*nzl,
-                            memoryType(), nullptr);
+    /* Allocate send/receive buffers and determine ID for communication North */
+    if (upwindRight)
       SUNMemoryHelper_Alloc(memhelp, &Nsend_, sizeof(REAL)*dof*width*nxl*nzl,
                             memoryType(), nullptr);
-      ipN = MPI_PROC_NULL;
-      if ((coords[1] < dims[1]-1) || (bc == BoundaryType::PERIODIC)) {
-        nbcoords[0] = coords[0];
-        nbcoords[1] = coords[1]+1;
-        nbcoords[2] = coords[2];
-        retval = MPI_Cart_rank(cart_comm, nbcoords, &ipN);
-        assert(retval == MPI_SUCCESS);
-      }
+    else
+      SUNMemoryHelper_Alloc(memhelp, &Nrecv_, sizeof(REAL)*dof*width*nxl*nzl,
+                            memoryType(), nullptr);
+    ipN = MPI_PROC_NULL;
+    if ((coords[1] < dims[1]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1]+1, coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipN);
+      assert(retval == MPI_SUCCESS);
     }
 
-    if (NDIMS == 3)
-    {
+    /* Allocate send/receive buffers and determine ID for communication Back */
+    if (upwindRight)
       SUNMemoryHelper_Alloc(memhelp, &Brecv_, sizeof(REAL)*dof*width*nxl*nyl,
                             memoryType(), nullptr);
+    else
       SUNMemoryHelper_Alloc(memhelp, &Bsend_, sizeof(REAL)*dof*width*nxl*nyl,
                             memoryType(), nullptr);
-      ipB = MPI_PROC_NULL;
-      if ((coords[2] > 0) || (bc == BoundaryType::PERIODIC)) {
-        nbcoords[0] = coords[0];
-        nbcoords[1] = coords[1];
-        nbcoords[2] = coords[2]-1;
-        retval = MPI_Cart_rank(cart_comm, nbcoords, &ipB);
-        assert(retval == MPI_SUCCESS);
-      }
+    ipB = MPI_PROC_NULL;
+    if ((coords[2] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1], coords[2]-1};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipB);
+      assert(retval == MPI_SUCCESS);
+    }
 
-      SUNMemoryHelper_Alloc(memhelp, &Frecv_, sizeof(REAL)*dof*width*nxl*nyl,
-                            memoryType(), nullptr);
+    /* Allocate send/receive buffers and determine ID for communication Front */
+    if (upwindRight)
       SUNMemoryHelper_Alloc(memhelp, &Fsend_, sizeof(REAL)*dof*width*nxl*nyl,
                             memoryType(), nullptr);
-      ipF = MPI_PROC_NULL;
-      if ((coords[2] < dims[2]-1) || (bc == BoundaryType::PERIODIC)) {
-        nbcoords[0] = coords[0];
-        nbcoords[1] = coords[1];
-        nbcoords[2] = coords[2]+1;
-        retval = MPI_Cart_rank(cart_comm, nbcoords, &ipF);
-        assert(retval == MPI_SUCCESS);
-      }
+    else
+      SUNMemoryHelper_Alloc(memhelp, &Frecv_, sizeof(REAL)*dof*width*nxl*nyl,
+                            memoryType(), nullptr);
+    ipF = MPI_PROC_NULL;
+    if ((coords[2] < dims[2]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1], coords[2]+1};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipF);
+      assert(retval == MPI_SUCCESS);
     }
 
   }
 
-  // TODO: this could be optimized for upwind
-  int ExchangeStart(std::function<void (REAL*,REAL*,REAL*,REAL*,REAL*,REAL*)> fill)
+  // Initiate non-blocking neighbor communication
+  int ExchangeStart()
   {
     int retval = 0;
+    nreq = 0;
 
     // Initialize all requests in array
     for (int i=0; i<12; i++)
       req[i] = MPI_REQUEST_NULL;
 
     // Open an Irecv buffer for each neighbor
-    if (ipW != MPI_PROC_NULL)
+    if ((ipW != MPI_PROC_NULL) && (upwindRight))
     {
-      retval = MPI_Irecv(getRecvBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW,
-                         1, cart_comm, req);
+      retval = MPI_Irecv(getRecvBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW,
+                         1, cart_comm, req+nreq);
       assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
-    if (ipE != MPI_PROC_NULL)
+    if ((ipE != MPI_PROC_NULL) && (!upwindRight))
     {
-      retval = MPI_Irecv(getRecvBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE,
-                         0, cart_comm, req+1);
+      retval = MPI_Irecv(getRecvBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE,
+                         0, cart_comm, req+nreq);
       assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
-    if (NDIMS >= 2)
+    if ((ipS != MPI_PROC_NULL) && (upwindRight))
     {
-      if (ipS != MPI_PROC_NULL)
-      {
-        retval = MPI_Irecv(getRecvBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS,
-                           3, cart_comm, req+2);
-        assert(retval == MPI_SUCCESS);
-      }
-
-      if (ipN != MPI_PROC_NULL)
-      {
-        retval = MPI_Irecv(getRecvBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN,
-                           2, cart_comm, req+3);
-        assert(retval == MPI_SUCCESS);
-      }
-    }
-
-    if (NDIMS >= 3)
+      retval = MPI_Irecv(getRecvBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS,
+                         3, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipN != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Irecv(getRecvBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN,
+                         2, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipB != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Irecv(getRecvBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB,
+                         5, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipF != MPI_PROC_NULL) && (!upwindRight))
     {
-      if (ipB != MPI_PROC_NULL)
-      {
-        retval = MPI_Irecv(getRecvBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB,
-                           5, cart_comm, req+4);
-        assert(retval == MPI_SUCCESS);
-      }
-
-      if (ipF != MPI_PROC_NULL)
-      {
-        retval = MPI_Irecv(getRecvBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF,
-                           4, cart_comm, req+5);
-        assert(retval == MPI_SUCCESS);
-      }
-    }
-
-    // Call user lambda to fill the send buffers
-    fill(getSendBuffer("WEST"),
-         getSendBuffer("EAST"),
-         getSendBuffer("SOUTH"),
-         getSendBuffer("NORTH"),
-         getSendBuffer("BACK"),
-         getSendBuffer("FRONT"));
+      retval = MPI_Irecv(getRecvBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF,
+                         4, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
 
     // Send data to neighbors
-    if (ipW != MPI_PROC_NULL)
+    if ((ipW != MPI_PROC_NULL) && (!upwindRight))
     {
-      retval = MPI_Isend(getSendBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, 0,
-                         cart_comm, req+6);
+      retval = MPI_Isend(getSendBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, 0,
+                         cart_comm, req+nreq);
       assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
-    if (ipE != MPI_PROC_NULL)
+    if ((ipE != MPI_PROC_NULL) && (upwindRight))
     {
-      retval = MPI_Isend(getSendBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, 1,
-                         cart_comm, req+7);
+      retval = MPI_Isend(getSendBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, 1,
+                         cart_comm, req+nreq);
       assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
-    if (NDIMS >= 2)
+    if ((ipS != MPI_PROC_NULL) && (!upwindRight))
     {
-      if (ipS != MPI_PROC_NULL)
-      {
-        retval = MPI_Isend(getSendBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, 2,
-                           cart_comm, req+8);
-        assert(retval == MPI_SUCCESS);
-      }
-
-      if (ipN != MPI_PROC_NULL)
-      {
-        retval = MPI_Isend(getSendBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, 3,
-                           cart_comm, req+9);
-        assert(retval == MPI_SUCCESS);
-      }
-    }
-
-    if (NDIMS == 3)
+      retval = MPI_Isend(getSendBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, 2,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipN != MPI_PROC_NULL) && (upwindRight))
     {
-      if (ipB != MPI_PROC_NULL)
-      {
-        retval = MPI_Isend(getSendBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, 4,
-                           cart_comm, req+10);
-        assert(retval == MPI_SUCCESS);
-      }
+      retval = MPI_Isend(getSendBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, 3,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
 
-      if (ipF != MPI_PROC_NULL)
-      {
-        retval = MPI_Isend(getSendBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, 5,
-                           cart_comm, req+11);
-        assert(retval == MPI_SUCCESS);
-      }
+    if ((ipB != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Isend(getSendBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, 4,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipF != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Isend(getSendBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, 5,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
     return retval;
@@ -359,8 +351,12 @@ class ParallelGrid
     MPI_Status stat[12];
     int retval;
 
+    // return automatically with success if there are no outstanding requests
+    if (nreq == 0)
+      return(0);
+
     // Wait for messages to finish send/receive
-    retval = MPI_Waitall(12, req, stat);
+    retval = MPI_Waitall(nreq, req, stat);
     assert(retval == MPI_SUCCESS);
 
     return retval;
@@ -370,12 +366,16 @@ class ParallelGrid
   void PrintInfo()
   {
     printf("ParallelGrid Info:\n");
-    printf("    dimensions = %d\n", NDIMS);
+    printf("    dimensions = %d\n", 3);
     printf("    processors = {%d, %d, %d}\n", npx, npy, npz);
     printf("        domain = {[%g,%g], [%g,%g], [%g,%g]}\n", ax, bx, ay, by, az, bz);
     printf("   global npts = {%li, %li, %li}\n", (long int) nx, (long int) ny, (long int) nz);
     printf("    local npts = {%d, %d, %d}\n", nxl, nyl, nzl);
     printf("  mesh spacing = {%g, %g, %g}\n", dx, dy, dz);
+    if (upwindRight)
+      printf("    upwind dir = right\n");
+    else
+      printf("    upwind dir = left\n");
   }
 
   // Saves the mesh to a file.
@@ -407,16 +407,12 @@ class ParallelGrid
 
   GLOBALINT npts() const
   {
-    if (NDIMS == 1) return nx;
-    if (NDIMS == 2) return nx*ny;
-    if (NDIMS == 3) return nx*ny*nz;
+    return nx*ny*nz;
   }
 
   GLOBALINT nptsl() const
   {
-    if (NDIMS == 1) return nxl;
-    if (NDIMS == 2) return nxl*nyl;
-    if (NDIMS == 3) return nxl*nyl*nzl;
+    return nxl*nyl*nzl;
   }
 
   GLOBALINT neql() const
@@ -452,6 +448,7 @@ class ParallelGrid
     }
     else
     {
+      assert(direction == "ILLEGAL");
       return nullptr;
     }
   }
@@ -484,24 +481,28 @@ class ParallelGrid
     }
     else
     {
+      assert(direction == "ILLEGAL");
       return nullptr;
     }
   }
 
   ~ParallelGrid()
   {
-    SUNMemoryHelper_Dealloc(memhelp, Esend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Wsend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Nsend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Ssend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Fsend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Bsend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Erecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Wrecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Nrecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Srecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Frecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Brecv_, nullptr);
+    if (upwindRight) {
+      SUNMemoryHelper_Dealloc(memhelp, Esend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Nsend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Fsend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Wrecv_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Srecv_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Brecv_, nullptr);
+    } else {
+      SUNMemoryHelper_Dealloc(memhelp, Wsend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Ssend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Bsend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Erecv_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Nrecv_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Frecv_, nullptr);
+    }
   }
 
   GLOBALINT nx, ny, nz;    /* number of intervals globally       */
@@ -516,6 +517,7 @@ class ParallelGrid
   int       ipW, ipE;      /* MPI ranks for neighbor procs       */
   int       ipS, ipN;
   int       ipB, ipF;
+  bool      upwindRight;   /* Upwind dir: true/false == R/L      */
 
   int       dims[3];
   int       coords[3];
@@ -524,6 +526,7 @@ class ParallelGrid
 private:
   MPI_Comm     cart_comm;  /* MPI cartesian communicator         */
   MPI_Request  req[12];
+  int          nreq;
 
   BoundaryType bc;
   StencilType  st;
diff --git a/benchmarks/advection_reaction_3D/README.md b/benchmarks/advection_reaction_3D/raja/README.md
similarity index 78%
rename from benchmarks/advection_reaction_3D/README.md
rename to benchmarks/advection_reaction_3D/raja/README.md
index ab9974b660..33c82db725 100644
--- a/benchmarks/advection_reaction_3D/README.md
+++ b/benchmarks/advection_reaction_3D/raja/README.md
@@ -8,27 +8,31 @@ RAJA performance portability layer with serial, CUDA, or HIP backends.
 This code simulates the advection and reaction of three chemical species where
 the reaction mechanism is a variation of the Brusselator problem from chemical
 kinetics. The PDE system is given by
+```math
+\begin{align}
+  u_t &= -c \nabla u + A - (w+1) u + v u^2 \\
+  v_t &= -c \nabla v + w u - v u^2 \\
+  w_t &= -c \nabla w + (B - w) / \epsilon - w u
+\end{align}
 ```
-    u_t = -c grad(u) + A - (w+1) * u + v * u^2
-    v_t = -c grad(v) + w * u - v * u^2
-    w_t = -c grad(w) + (B - w) / epsilon - w * u
-```
-where `u`, `v`, and `w` are chemical concentrations, `c` is the advection speed,
-`A` and `B` are the concentrations of chemical species that remain constant over
-space and time, and `epsilon` is a parameter that varies the stiffness of the
-system. The problem is solved on the domain `(x,y,z) = X` in `[0, X_max]^3`,
-for times `t` in `[0,t_f]`. The initial condition is
-```
-    u(0,X) = A + p(X)
-    v(0,X) = B / A + p(X)
-    w(0,X) = 3.0 + p(X)
+where $u$, $v$, and $w$ are chemical concentrations, $c$ is the advection speed,
+$A$ and $B$ are the concentrations of chemical species that remain constant over
+space and time, and $\epsilon$ is a parameter that varies the stiffness of the
+system. The problem is solved on the domain $(x,y,z) = X$ in $[0, X_{\text{max}}]^3$,
+for times $t$ in $[0,t_f]$. The initial condition is
+```math
+\begin{align}
+    u(0,X) &= A + p(X) \\
+    v(0,X) &= B / A + p(X) \\
+    w(0,X) &= 3.0 + p(X)
+\end{align}
 ```
 where the perturbation function is
+```math
+    p(X) = \alpha \exp\left( -\frac{(X-\mu)^T \sigma^{-1} (X-\mu)}{2 \sqrt{|\sigma| 8 \pi^3}} \right)
 ```
-    p(X) = alpha * e^( -((X-mu)^T sigma^{-1} (X-mu)) / (2*sqrt(|sigma| 8 pi^3)) )
-```
-with `alpha = 0.1`, `mu = 0.5 X_max`, and `sigma` is a diagonal matrix with
-entries `0.25 X_max`.
+with $\alpha = 0.1$, $\mu = 0.5 X_{\text{max}}$, and $\sigma$ a diagonal
+matrix with entries $0.25 X_{\text{max}}$.
 
 Spatial derivatives are discretized with first-order upwind finite differences
 on a uniform spatial grid. The system can be evolved in time using explicit,
@@ -64,7 +68,7 @@ listed below.
 | `--method <method>`         | Integrator to use: `ERK`, `ARK-DIRK`, `ARK-IMEX`, `CV-BDF`, `CV-ADAMS`, `IDA` | `ARK-DIRK`  |
 | `--nls <method>`            | Nonlinear Solver Method: `newton`, `tl-newton`, `fixedpoint`, `none`          | `newton`    |
 | `--fpaccel <int>`           | Number of fixed point acceleration vectors                                    | 3           |
-| `--nopre`                   | Disable preconditioning                                                       | False       | 
+| `--nopre`                   | Disable preconditioning                                                       | False       |
 | `--fused`                   | Enabled fused operations                                                      | Off         |
 | `--tf <realtype>`           | Final integration time `t_f`                                                  | 10.0        |
 | `--rtol <realtype>`         | Relative tolerance                                                            | 1.0e-6      |
@@ -79,11 +83,11 @@ GPUs. See the installation guide for more details on configuring, building,
 and installing SUNDIALS.
 
 Based on the configuration the following executables will be built and installed
-in the `<install prefix>/bin/benchmarks/advection_reaction_3D` directory:
+in the `<benchmarks install prefix>/advection_reaction_3D/raja` directory:
 
-* `advection_reaction_3D` -- MPI parallelism
-* `advection_reaction_3D_mpicuda` -- MPI + CUDA parallelism
-* `advection_reaction_3D_mpihip` -- MPI + HIP parallelism
+* `advection_reaction_3D_raja` -- MPI parallelism
+* `advection_reaction_3D_raja_mpicuda` -- MPI + CUDA parallelism
+* `advection_reaction_3D_raja_mpihip` -- MPI + HIP parallelism
 
 On Summit, with the default environment
 ```
@@ -93,7 +97,7 @@ On Summit, with the default environment
 ```
 an example `jsrun` command is
 ```
-jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_mpicuda
+jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_raja_mpicuda
 ```
 
 On Lassen, with the environment
@@ -104,5 +108,5 @@ On Lassen, with the environment
 ```
 an example `jsrun` command is
 ```
-jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_mpicuda
+jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_raja_mpicuda
 ```
diff --git a/benchmarks/advection_reaction_3D/advection_reaction_3D.cpp b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.cpp
similarity index 71%
rename from benchmarks/advection_reaction_3D/advection_reaction_3D.cpp
rename to benchmarks/advection_reaction_3D/raja/advection_reaction_3D.cpp
index dc169c5fa1..088e4536a0 100644
--- a/benchmarks/advection_reaction_3D/advection_reaction_3D.cpp
+++ b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.cpp
@@ -1,5 +1,6 @@
 /* -----------------------------------------------------------------------------
  * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
+ *                Daniel R. Reynolds @ SMU
  * -----------------------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2023, Lawrence Livermore National Security
@@ -60,9 +61,13 @@
 
 #include "advection_reaction_3D.hpp"
 
+#define STENCIL_WIDTH 1
+
+
 /* Main Program */
 int main(int argc, char *argv[])
 {
+
   SUNContext ctx;
 
   /* Initialize MPI */
@@ -87,7 +92,6 @@ int main(int argc, char *argv[])
     UserData     udata(ctx);    /* user data                    */
     UserOptions  uopt;          /* user options                 */
     int          retval;        /* reusable error-checking flag */
-    char         fname[MXSTR];
 
     SUNDIALS_CXX_MARK_FUNCTION(udata.prof);
 
@@ -113,6 +117,7 @@ int main(int argc, char *argv[])
     /* Output spatial mesh to disk (add extra point for periodic BC) */
     if (udata.myid == 0 && uopt.nout > 0)
     {
+      char fname[MXSTR];
       snprintf(fname, MXSTR, "%s/mesh.txt", uopt.outputdir);
       udata.grid->MeshToFile(fname);
     }
@@ -124,7 +129,6 @@ int main(int argc, char *argv[])
     else if (uopt.method == "CV-BDF")   retval = EvolveProblemBDF(y, &udata, &uopt);
     else if (uopt.method == "CV-ADAMS") retval = EvolveProblemAdams(y, &udata, &uopt);
     else if (uopt.method == "IDA")      retval = EvolveDAEProblem(y, &udata, &uopt);
-
     if (check_retval(&retval, "Evolve", 1, udata.myid)) MPI_Abort(comm, 1);
 
     /* Clean up */
@@ -142,15 +146,6 @@ int main(int argc, char *argv[])
 /* Destructor for problem data */
 UserData::~UserData()
 {
-  /* free solution masks */
-  N_VDestroy(N_VGetLocalVector_MPIPlusX(umask));
-  N_VDestroy(umask);
-  N_VDestroy(vmask);
-  N_VDestroy(wmask);
-
-  /* free the parallel grid */
-  delete grid;
-
   /* close output streams */
   if (uopt->nout > 0)
   {
@@ -159,6 +154,24 @@ UserData::~UserData()
     if (WFID) fclose(WFID);
     if (TFID && myid == 0) fclose(TFID);
   }
+
+  /* free solution masks */
+  if (umask != nullptr) {
+    N_VDestroy(N_VGetLocalVector_MPIPlusX(umask));
+    N_VDestroy(umask);
+    umask = nullptr;
+  }
+  if (vmask != nullptr) {
+    N_VDestroy(vmask);
+    vmask = nullptr;
+  }
+  if (wmask != nullptr) {
+    N_VDestroy(wmask);
+    wmask = nullptr;
+  }
+
+  /* free the parallel grid */
+  delete grid;
 }
 
 
@@ -166,175 +179,98 @@ UserData::~UserData()
  * Communication functions
  * --------------------------------------------------------------*/
 
-/* Exchanges the boundary conditions only, */
-int ExchangeBCOnly(N_Vector y, UserData* udata)
+/* Fills send buffers before exchanging neighbor information */
+int FillSendBuffers(N_Vector y, UserData* udata)
 {
-  int ierr;
-  MPI_Status stat;
-  MPI_Request reqR, reqS;
 
   /* shortcuts */
-  int nvar  = udata->grid->dof;
-  int myid  = udata->myid;
-  int first = 0;
-  int last  = udata->nprocs - 1;
+  const realtype c = udata->c;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const int dof = udata->grid->dof;
 
-  /* extract the data */
-  realtype* Ydata = GetVecData(y);
-  realtype* Wsend = udata->grid->getSendBuffer("WEST");
+  /* Create a 4D view of the vector */
+  RAJA::View<realtype, RAJA::Layout<4> > Yview(GetVecData(y),
+                                               nxl, nyl, nzl, dof);
 
-  /* open the East Irecv buffer */
-  if (myid == last)
-  {
-    ierr = MPI_Irecv(udata->grid->getRecvBuffer("EAST"), nvar, MPI_SUNREALTYPE, first,
-                     MPI_ANY_TAG, udata->comm, &reqR);
-  }
-
-  /* send first mesh node to the last processor */
-  if (myid == first)
-  {
-    RAJA::forall< EXEC_POLICY >( RAJA::RangeSegment(0, nvar),
-      [=] DEVICE_FUNC (int var) {
-      Wsend[IDX(nvar, 0, var)] = Ydata[IDX(nvar, 0, var)];
-    });
-    ierr = MPI_Isend(Wsend, nvar, MPI_SUNREALTYPE,
-                     last, 0, udata->comm, &reqS);
-  }
-
-  if (myid == last)
-  {
-    /* wait for exchange to finish */
-    ierr = MPI_Wait(&reqR, &stat);
-    if (ierr != MPI_SUCCESS)
-    {
-      fprintf(stderr, "\nERROR: error in MPI_Wait = %d\n", ierr);
-      return -1;
-    }
-  }
-
-  if (myid == first)
+  if (c > 0.0)
   {
-    /* wait for exchange to finish */
-    ierr = MPI_Wait(&reqS, &stat);
-    if (ierr != MPI_SUCCESS)
-    {
-      fprintf(stderr, "\nERROR: error in MPI_Wait = %d\n", ierr);
-      return -1;
-    }
-  }
-
-  return(0);
-}
 
+    /* Flow moving in the positive directions uses backward difference. */
 
-/* Starts the exchange of the neighbor information */
-int ExchangeAllStart(N_Vector y, UserData* udata)
-{
-  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+    /* Fill 3D views of send buffers on device */
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Esend(udata->grid->getSendBuffer("EAST"),  nyl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Nsend(udata->grid->getSendBuffer("NORTH"), nxl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Fsend(udata->grid->getSendBuffer("FRONT"), nxl, nyl, dof);
+
+    auto east_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, nzl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(east_face,
+      [=] DEVICE_FUNC (int j, int k, int l) {
+        Esend(j,k,l) = Yview(nxl-1,j,k,l);
+    });
 
-  /* shortcuts */
-  realtype c = udata->c;
+    auto north_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nzl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(north_face,
+      [=] DEVICE_FUNC (int i, int k, int l) {
+        Nsend(i,k,l) = Yview(i,nyl-1,k,l);
+    });
 
-  /* extract the data */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     udata->grid->nxl,
-                                                     udata->grid->nyl,
-                                                     udata->grid->nzl,
-                                                     udata->grid->dof);
+    auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nyl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(front_face,
+      [=] DEVICE_FUNC (int i, int j, int l) {
+        Fsend(i,j,l) = Yview(i,j,nzl-1,l);
+    });
 
-  if (c > 0.0)
-  {
-    /* Flow moving in the positive directions uses backward difference. */
-    udata->grid->ExchangeStart(
-      [=] (realtype*, realtype* Esend, realtype*, realtype* Nsend, realtype* Bsend, realtype*) {
-        int nxl = udata->grid->nxl;
-        int nyl = udata->grid->nyl;
-        int nzl = udata->grid->nzl;
-        int dof = udata->grid->dof;
-
-        auto range = RAJA::make_tuple(RAJA::RangeSegment(0, std::max(1,nxl-1)),
-                                      RAJA::RangeSegment(0, std::max(1,nyl-1)),
-                                      RAJA::RangeSegment(0, std::max(1,nzl-1)));
-
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Eview(Esend, nyl, nzl, dof);
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Nview(Nsend, nxl, nzl, dof);
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Bview(Bsend, nxl, nyl, dof);
-
-        RAJA::kernel<XYZ_KERNEL_POL>(range,
-          [=] DEVICE_FUNC (int i, int j, int k) {
-
-          if (nxl > 1)
-          {
-            Eview(j,k,0) = Yview(nxl-1,j,k,0);
-            Eview(j,k,1) = Yview(nxl-1,j,k,1);
-            Eview(j,k,2) = Yview(nxl-1,j,k,2);
-          }
-
-          if (nyl > 1)
-          {
-            Nview(i,k,0) = Yview(i,nyl-1,k,0);
-            Nview(i,k,1) = Yview(i,nyl-1,k,1);
-            Nview(i,k,2) = Yview(i,nyl-1,k,2);
-          }
-
-          if (nzl > 1)
-          {
-            Bview(i,j,0) = Yview(i,j,nzl-1,0);
-            Bview(i,j,1) = Yview(i,j,nzl-1,1);
-            Bview(i,j,2) = Yview(i,j,nzl-1,2);
-          }
-
-        });
-      });
   }
   else if (c < 0.0)
   {
+
     /* Flow moving in the negative directions uses forward difference. */
 
-    udata->grid->ExchangeStart(
-      [=] (realtype* Wsend, realtype*, realtype*Ssend, realtype*, realtype*, realtype* Fsend) {
-        auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl-1),
-                                      RAJA::RangeSegment(0, udata->grid->nyl-1),
-                                      RAJA::RangeSegment(0, udata->grid->nzl-1));
-
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Wview(Wsend, udata->grid->nyl, udata->grid->nzl, udata->grid->dof);
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Sview(Ssend, udata->grid->nxl, udata->grid->nzl, udata->grid->dof);
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Fview(Fsend, udata->grid->nxl, udata->grid->nyl, udata->grid->dof);
-
-        RAJA::kernel<XYZ_KERNEL_POL>(range,
-          [=] DEVICE_FUNC (int i, int j, int k) {
-          Wview(j,k,0) = Yview(0,j,k,0);
-          Wview(j,k,1) = Yview(0,j,k,1);
-          Wview(j,k,2) = Yview(0,j,k,2);
-
-          Sview(i,k,0) = Yview(i,0,k,0);
-          Sview(i,k,1) = Yview(i,0,k,1);
-          Sview(i,k,2) = Yview(i,0,k,2);
-
-          Fview(i,j,0) = Yview(i,j,0,0);
-          Fview(i,j,1) = Yview(i,j,0,1);
-          Fview(i,j,2) = Yview(i,j,0,2);
-        });
-      });
-  }
+    /* Fill 3D views of send buffers on device */
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Wsend(udata->grid->getSendBuffer("WEST"),  nyl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Ssend(udata->grid->getSendBuffer("SOUTH"), nxl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Bsend(udata->grid->getSendBuffer("BACK"),  nxl, nyl, dof);
+
+    auto west_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, nzl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(west_face,
+      [=] DEVICE_FUNC (int j, int k, int l) {
+        Wsend(j,k,l) = Yview(0,j,k,l);
+    });
 
-  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
-  return(0);
-}
+    auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nzl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(south_face,
+      [=] DEVICE_FUNC (int i, int k, int l) {
+        Ssend(i,k,l) = Yview(i,0,k,l);
+    });
 
+    auto back_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                      RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(back_face,
+      [=] DEVICE_FUNC (int i, int j, int l) {
+        Bsend(i,j,l) = Yview(i,j,0,l);
+    });
+
+  }
 
-/* Completes the exchange of the neighbor information */
-int ExchangeAllEnd(UserData* udata)
-{
-  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
-  udata->grid->ExchangeEnd();
-  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
   return(0);
 }
 
@@ -494,17 +430,20 @@ int ComponentMask(N_Vector mask, int component, const UserData* udata)
 
   N_VConst(0.0, mask);
 
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > mask_view(GetVecData(mask),
-                                                         udata->grid->nxl,
-                                                         udata->grid->nyl,
-                                                         udata->grid->nzl,
-                                                         udata->grid->dof);
+  /* Create 4D view of mask data */
+  RAJA::View<realtype, RAJA::Layout<4> > mask_view(GetVecData(mask),
+                                                   udata->grid->nxl,
+                                                   udata->grid->nyl,
+                                                   udata->grid->nzl,
+                                                   udata->grid->dof);
+  /* Fill mask data */
   auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
                                 RAJA::RangeSegment(0, udata->grid->nyl),
                                 RAJA::RangeSegment(0, udata->grid->nzl));
   RAJA::kernel<XYZ_KERNEL_POL>(range,
-    [=] DEVICE_FUNC (int xi, int yi, int zi) {
-    mask_view(xi,yi,zi,component) = 1.0;
+    [=] DEVICE_FUNC (int i, int j, int k)
+  {
+    mask_view(i,j,k,component) = 1.0;
   });
 
   return 0;
@@ -515,14 +454,9 @@ int ComponentMask(N_Vector mask, int component, const UserData* udata)
 int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
                  SUNMemoryHelper memhelper, SUNContext ctx)
 {
-  constexpr int STENCIL_WIDTH = 1;
 
   SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
 
-  /* Local variables */
-  int retval = 0;
-  char fname[MXSTR];
-
   /* MPI variables */
   udata->comm = MPI_COMM_WORLD;
   MPI_Comm_rank(udata->comm, &udata->myid);
@@ -567,16 +501,16 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
   uopt->outputdir = (char *) "."; /* output directory         */
 
   /* Parse CLI args and set udata/uopt appropriately */
-  retval = ParseArgs(argc, argv, udata, uopt);
+  int retval = ParseArgs(argc, argv, udata, uopt);
   if (check_retval((void*)&retval, "ParseArgs", 1, udata->myid)) return -1;
 
   /* Setup the parallel decomposition */
   const sunindextype npts[] = {uopt->npts, uopt->npts, uopt->npts};
   const realtype amax[] = {0.0, 0.0, 0.0};
   const realtype bmax[] = {udata->xmax, udata->xmax, udata->xmax};
-  udata->grid = new ParallelGrid<realtype,sunindextype,NDIMS>(memhelper,
-    &udata->comm, amax, bmax, npts, 3, BoundaryType::PERIODIC, StencilType::UPWIND, STENCIL_WIDTH, uopt->npxyz
-  );
+  udata->grid = new ParallelGrid<realtype,sunindextype>(memhelper, &udata->comm,
+    amax, bmax, npts, 3, BoundaryType::PERIODIC, StencilType::UPWIND, udata->c,
+    STENCIL_WIDTH, uopt->npxyz);
 
   /* Create the solution masks */
   udata->umask = N_VMake_MPIPlusX(udata->comm, LocalNvector(udata->grid->neq, ctx), ctx);
@@ -589,6 +523,7 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
   /* Open output files for results */
   if (uopt->save)
   {
+    char fname[MXSTR];
     if (udata->myid == 0)
     {
       sprintf(fname, "%s/t.%06d.txt", uopt->outputdir, udata->myid);
@@ -609,7 +544,7 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
   if (udata->myid == 0)
   {
     printf("\n\t\tAdvection-Reaction Test Problem\n\n");
-    printf("Using the %s NVECTOR\n", NVECTOR_ID_STRING);
+    printf("Using the MPI+%s NVECTOR\n", NVECTOR_ID_STRING);
     printf("Number of Processors = %li\n", (long int) udata->nprocs);
     udata->grid->PrintInfo();
     printf("Problem Parameters:\n");
@@ -632,7 +567,6 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
     printf("Output directory: %s\n", uopt->outputdir);
   }
 
-
   /* return success */
   return(0);
 }
@@ -644,8 +578,8 @@ void Gaussian3D(realtype& x, realtype& y, realtype& z, realtype xmax)
 {
   /* Gaussian distribution defaults */
   const realtype alpha = 0.1;
-  const realtype mu[3] = { xmax/RCONST(2.0), xmax/RCONST(2.0), xmax/RCONST(2.0) };
-  const realtype sigma[3] = { xmax/RCONST(4.0), xmax/RCONST(4.0), xmax/RCONST(4.0) }; // Sigma = diag(sigma)
+  const realtype mu[] = { xmax/RCONST(2.0), xmax/RCONST(2.0), xmax/RCONST(2.0) };
+  const realtype sigma[] = { xmax/RCONST(4.0), xmax/RCONST(4.0), xmax/RCONST(4.0) }; // Sigma = diag(sigma)
 
   /* denominator = 2*sqrt(|Sigma|*(2pi)^3) */
   const realtype denom = 2.0 * sqrt((sigma[0]*sigma[1]*sigma[2])*pow(2*M_PI,3));
@@ -664,6 +598,7 @@ int SetIC(N_Vector y, UserData* udata)
   const int      nxl  = udata->grid->nxl;
   const int      nyl  = udata->grid->nyl;
   const int      nzl  = udata->grid->nzl;
+  const int      dof  = udata->grid->dof;
   const realtype dx   = udata->grid->dx;
   const realtype dy   = udata->grid->dy;
   const realtype dz   = udata->grid->dz;
@@ -683,22 +618,25 @@ int SetIC(N_Vector y, UserData* udata)
   const realtype vs = k2 * k4 * B / (k1 * k3 * A);
   const realtype ws = 3.0;
 
+  /* Create 4D view of y */
+  RAJA::View<realtype, RAJA::Layout<4> > yview(GetVecData(y),
+                                               nxl, nyl, nzl, dof);
+
   /* Gaussian perturbation of the steady state solution */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > yview(GetVecData(y), nxl, nyl, nzl,
-                                                     udata->grid->dof);
   auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
                                 RAJA::RangeSegment(0, nyl),
                                 RAJA::RangeSegment(0, nzl));
   RAJA::kernel<XYZ_KERNEL_POL>(range,
-    [=] DEVICE_FUNC (int xi, int yi, int zi) {
-    realtype x = (xcrd * nxl + xi) * dx;
-    realtype y = (ycrd * nyl + yi) * dy;
-    realtype z = (zcrd * nzl + zi) * dz;
+    [=] DEVICE_FUNC (int i, int j, int k)
+  {
+    realtype x = (xcrd * nxl + i) * dx;
+    realtype y = (ycrd * nyl + j) * dy;
+    realtype z = (zcrd * nzl + k) * dz;
     Gaussian3D(x,y,z,xmax);
     const realtype p = x + y + z;
-    yview(xi,yi,zi,0) = us + p;
-    yview(xi,yi,zi,1) = vs + p;
-    yview(xi,yi,zi,2) = ws + p;
+    yview(i,j,k,0) = us + p;
+    yview(i,j,k,1) = vs + p;
+    yview(i,j,k,2) = ws + p;
   });
 
   /* Return success */
@@ -710,23 +648,17 @@ int SetIC(N_Vector y, UserData* udata)
 int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt)
 {
   SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-  
-  realtype  u, v, w, N;
-  realtype* ydata = NULL;
-
-  /* get vector data array */
-  ydata = N_VGetArrayPointer(y);
-  if (check_retval((void *) ydata, "N_VGetArrayPointer", 0, udata->myid)) return -1;
 
+  /* Copy solution data to host mirror view */
   CopyVecFromDevice(N_VGetLocalVector_MPIPlusX(y));
 
   /* output current solution norm to screen */
-  N = (realtype) udata->grid->npts();
-  u = N_VWL2Norm(y, udata->umask);
+  realtype N = (realtype) udata->grid->npts();
+  realtype u = N_VWL2Norm(y, udata->umask);
   u = sqrt(u*u/N);
-  v = N_VWL2Norm(y, udata->vmask);
+  realtype v = N_VWL2Norm(y, udata->vmask);
   v = sqrt(v*v/N);
-  w = N_VWL2Norm(y, udata->wmask);
+  realtype w = N_VWL2Norm(y, udata->wmask);
   w = sqrt(w*w/N);
   if (udata->myid == 0) {
     printf("     %10.6f   %10.6f   %10.6f   %10.6f\n", t, u, v, w);
@@ -736,32 +668,38 @@ int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt)
   if (uopt->save)
   {
     /* output the times to disk */
-    if (udata->myid == 0 && udata->TFID)
+    if (udata->myid == 0 && udata->TFID) {
       fprintf(udata->TFID," %.16e\n", t);
+      std::fflush(udata->TFID);
+    }
+
+    /* create 4D view of host data */
+    realtype* ydata = NULL;
+    ydata = N_VGetArrayPointer(y);
+    if (check_retval((void *) ydata, "N_VGetArrayPointer", 0, udata->myid)) return -1;
+    const int nxl = udata->grid->nxl;
+    const int nyl = udata->grid->nyl;
+    const int nzl = udata->grid->nzl;
+    const int dof = udata->grid->dof;
+    RAJA::View<realtype, RAJA::Layout<4> > Yview(ydata, nxl, nyl, nzl, dof);
 
     /* output results to disk */
-    RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(ydata,
-                                                       udata->grid->nxl,
-                                                       udata->grid->nyl,
-                                                       udata->grid->nzl,
-                                                       udata->grid->dof);
-
-    auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                  RAJA::RangeSegment(0, udata->grid->nyl),
-                                  RAJA::RangeSegment(0, udata->grid->nzl));
-
-    RAJA::kernel<XYZ_KERNEL_SERIAL_POLICY>(range,
-      [=] (int i, int j, int k) {
-      fprintf(udata->UFID," %.16e", Yview(i,j,k,0));
-      fprintf(udata->VFID," %.16e", Yview(i,j,k,1));
-      fprintf(udata->WFID," %.16e", Yview(i,j,k,2));
-    });
+    for (int i = 0; i < nxl; i++)
+      for (int j = 0; j < nyl; j++)
+        for (int k = 0; k < nzl; k++) {
+          fprintf(udata->UFID," %.16e", Yview(i,j,k,0));
+          fprintf(udata->VFID," %.16e", Yview(i,j,k,1));
+          fprintf(udata->WFID," %.16e", Yview(i,j,k,2));
+        }
 
     fprintf(udata->UFID,"\n");
     fprintf(udata->VFID,"\n");
     fprintf(udata->WFID,"\n");
+    std::fflush(udata->UFID);
+    std::fflush(udata->VFID);
+    std::fflush(udata->WFID);
   }
-  
+
   return(0);
 }
 
@@ -799,4 +737,3 @@ void InputError(char *name)
 
   MPI_Barrier(MPI_COMM_WORLD);
 }
-
diff --git a/benchmarks/advection_reaction_3D/advection_reaction_3D.hpp b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.hpp
similarity index 91%
rename from benchmarks/advection_reaction_3D/advection_reaction_3D.hpp
rename to benchmarks/advection_reaction_3D/raja/advection_reaction_3D.hpp
index 4396e69eb5..e4227d62c7 100644
--- a/benchmarks/advection_reaction_3D/advection_reaction_3D.hpp
+++ b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.hpp
@@ -1,5 +1,6 @@
 /* -----------------------------------------------------------------------------
  * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
+ *                Daniel R. Reynolds @ SMU
  * -----------------------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2023, Lawrence Livermore National Security
@@ -34,19 +35,9 @@ using sundials_tools::BoundaryType;
 using sundials_tools::StencilType;
 using std::string;
 
-/* Number of dimensions */
-constexpr int NDIMS = 3;
-
 /* Maximum size of output directory string */
 constexpr int MXSTR = 2048;
 
-/* Accessor macro:
-   n = number of state variables
-   i = mesh node index
-   c = component */
-#define IDX(n,i,c) ((n)*(i)+(c))
-
-
 /*
  * Data structure for problem options
  */
@@ -113,7 +104,7 @@ struct UserData
   realtype  c;    /* advection coefficient        */
 
   /* parallel mesh */
-  ParallelGrid<realtype,sunindextype,NDIMS>* grid;
+  ParallelGrid<realtype,sunindextype>* grid;
 
   /* count of implicit function evals by the task local nonlinear solver */
   long int nnlfi;
@@ -122,7 +113,10 @@ struct UserData
   UserOptions* uopt;
 
   /* constructor that takes the context */
-  UserData(SUNContext ctx) : ctx(ctx) {
+  UserData(SUNContext ctx)
+    : ctx(ctx), umask(nullptr), vmask(nullptr), wmask(nullptr), uopt(nullptr),
+      TFID(nullptr), UFID(nullptr), VFID(nullptr), WFID(nullptr)
+  {
     SUNContext_GetProfiler(ctx, &prof);
   }
 
@@ -161,15 +155,14 @@ extern int EvolveDAEProblem(N_Vector y, UserData* udata, UserOptions* uopt);
 /* function to set initial condition */
 int SetIC(N_Vector y, UserData* udata);
 
-/* functions to exchange neighbor data */
-int ExchangeBCOnly(N_Vector y, UserData* udata);
-int ExchangeAllStart(N_Vector y, UserData* udata);
-int ExchangeAllEnd(UserData* udata);
+/* function to fill neighbor data */
+int FillSendBuffers(N_Vector y, UserData* udata);
 
 /* functions for processing command line args */
 int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
                  SUNMemoryHelper memhelper, SUNContext ctx);
 void InputError(char *name);
+int ComponentMask(N_Vector mask, const int component, const UserData* udata);
 
 /* function to write solution to disk */
 int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt);
diff --git a/benchmarks/advection_reaction_3D/raja/arkode_driver.cpp b/benchmarks/advection_reaction_3D/raja/arkode_driver.cpp
new file mode 100644
index 0000000000..e2cf1451e3
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/arkode_driver.cpp
@@ -0,0 +1,782 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#include "arkode/arkode_arkstep.h"
+#include "arkode/arkode_erkstep.h"
+#include "sunlinsol/sunlinsol_spgmr.h"
+#include "sunnonlinsol/sunnonlinsol_newton.h"
+#include "sunnonlinsol/sunnonlinsol_fixedpoint.h"
+#include "advection_reaction_3D.hpp"
+#include "rhs3D.hpp"
+
+/*
+ * Definitions for a custom task local SUNNonlinearSolver
+ */
+
+typedef struct
+{
+  int                myid;
+  int                nprocs;
+  long int           ncnf;
+  MPI_Comm           comm;
+  SUNNonlinearSolver local_nls;
+} *TaskLocalNewton_Content;
+
+/* Content accessor macros */
+#define GET_NLS_CONTENT(NLS) ( (TaskLocalNewton_Content)(NLS->content) )
+#define LOCAL_NLS(NLS)       ( GET_NLS_CONTENT(NLS)->local_nls )
+
+/* SUNNonlinearSolver constructor */
+SUNNonlinearSolver TaskLocalNewton(SUNContext ctx, N_Vector y, FILE* DFID);
+
+
+/* --------------------------------------------------------------
+ * Evolve functions
+ * --------------------------------------------------------------*/
+
+/* Setup ARKODE and evolve the problem in time with a DIRK method: the combined
+   AdvectionReaction RHS is passed to ARKStep as its only (implicit) RHS.
+   Returns 1 if setup fails, 0 otherwise (an evolve failure only ends the loop). */
+int EvolveProblemDIRK(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              arkode_mem = NULL;  /* empty ARKODE memory structure    */
+  SUNNonlinearSolver NLS = NULL;         /* empty nonlinear solver structure */
+  SUNLinearSolver    LS  = NULL;         /* empty linear solver structure    */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, nst_a, netf;  /* step stats                   */
+  long int nfe, nfi;          /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+  long int nli, npsol;        /* linear solver stats          */
+  FILE*    DFID = NULL;       /* diagnostics output file      */
+  char     fname[MXSTR];
+
+  /* Not additively split: one RHS (AdvectionReaction) is used, so enable add_reactions */
+  udata->add_reactions = true;
+
+  /* Create the ARK timestepper module */
+  arkode_mem = ARKStepCreate(NULL, AdvectionReaction, uopt->t0, y, udata->ctx);
+  if (check_retval((void*)arkode_mem, "ARKStepCreate", 0, udata->myid)) return 1;
+
+  /* Select the method order */
+  retval = ARKStepSetOrder(arkode_mem, uopt->order);
+  if (check_retval(&retval, "ARKStepSetOrder", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = ARKStepSetUserData(arkode_mem, (void*) udata);
+  if (check_retval(&retval, "ARKStepSetUserData", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = ARKStepSStolerances(arkode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "ARKStepSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = ARKStepSetMaxNumSteps(arkode_mem, 100000);
+  if (check_retval(&retval, "ARKStepSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Open output file for integrator diagnostics */
+  if (uopt->save)
+  {
+    sprintf(fname, "%s/diagnostics.%06d.txt", uopt->outputdir, udata->myid);
+    DFID = fopen(fname, "w");
+
+    retval = ARKStepSetDiagnostics(arkode_mem, DFID);
+    if (check_retval(&retval, "ARKStepSetDiagnostics", 1, udata->myid)) return 1;
+  }
+
+  /* Create the (non)linear solver */
+  if (uopt->nls == "newton")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_Newton(y, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+
+    /* Create linear solver */
+    LS = uopt->precond ? SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx) : SUNLinSol_SPGMR(y, PREC_NONE, 0, udata->ctx);
+    if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1;
+
+    /* Attach linear solver */
+    retval = ARKStepSetLinearSolver(arkode_mem, LS, NULL);
+    if (check_retval(&retval, "ARKStepSetLinearSolver", 1, udata->myid)) return 1;
+
+    /* Attach preconditioner */
+    retval = ARKStepSetPreconditioner(arkode_mem, NULL, PSolve);
+    if (check_retval(&retval, "ARKStepSetPreconditioner", 1, udata->myid)) return 1;
+  }
+  else if (uopt->nls == "fixedpoint")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+  }
+  else
+  {
+    fprintf(stderr, "\nERROR: ARK-DIRK is not compatible with the nls option provided\n");
+    return 1;
+  }
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0) dtout /= uopt->nout;  /* nout == 0: one interval to tf */
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = ARKStepEvolve(arkode_mem, tout, y, &t, ARK_NORMAL);
+    if (check_retval(&retval, "ARKStepEvolve", 1, udata->myid)) break;
+
+    /* Output state */
+    if(uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* close output stream (DFID stays NULL if it was never opened or fopen failed) */
+  if (DFID) fclose(DFID);
+
+  /* Get final statistics */
+  retval = ARKStepGetNumSteps(arkode_mem, &nst);
+  check_retval(&retval, "ARKStepGetNumSteps", 1, udata->myid);
+  retval = ARKStepGetNumStepAttempts(arkode_mem, &nst_a);
+  check_retval(&retval, "ARKStepGetNumStepAttempts", 1, udata->myid);
+  retval = ARKStepGetNumRhsEvals(arkode_mem, &nfe, &nfi);
+  check_retval(&retval, "ARKStepGetNumRhsEvals", 1, udata->myid);
+  retval = ARKStepGetNumErrTestFails(arkode_mem, &netf);
+  check_retval(&retval, "ARKStepGetNumErrTestFails", 1, udata->myid);
+  retval = ARKStepGetNumNonlinSolvIters(arkode_mem, &nni);
+  check_retval(&retval, "ARKStepGetNumNonlinSolvIters", 1, udata->myid);
+  retval = ARKStepGetNumNonlinSolvConvFails(arkode_mem, &ncnf);
+  check_retval(&retval, "ARKStepGetNumNonlinSolvConvFails", 1, udata->myid);
+  if (uopt->nls == "newton")
+  {
+    retval = ARKStepGetNumLinIters(arkode_mem, &nli);
+    check_retval(&retval, "ARKStepGetNumLinIters", 1, udata->myid);
+    retval = ARKStepGetNumPrecSolves(arkode_mem, &npsol);
+    check_retval(&retval, "ARKStepGetNumPrecSolves", 1, udata->myid);
+  }
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li (attempted = %li)\n", nst, nst_a);
+    printf("   Total RHS evals:  Fe = %li,  Fi = %li\n", nfe, nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n", ncnf);
+    printf("   Total number of nonlinear iterations = %li\n", nni);
+    if (uopt->nls == "newton")
+    {
+      printf("   Total number of linear iterations = %li\n", nli);
+      printf("   Total number of preconditioner solves = %li\n", npsol);
+    }
+  }
+
+  /* Clean up */
+  ARKStepFree(&arkode_mem);
+  if (NLS) SUNNonlinSolFree(NLS);
+  if (LS) SUNLinSolFree(LS);
+
+  /* Return success */
+  return(0);
+}
+
+
+/* Setup ARKODE and evolve the problem in time with an IMEX method: Advection is
+   passed to ARKStep as the explicit RHS and Reaction as the implicit RHS.
+   Returns 1 if setup fails, 0 otherwise (an evolve failure only ends the loop). */
+int EvolveProblemIMEX(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              arkode_mem = NULL;  /* empty ARKODE memory structure    */
+  SUNNonlinearSolver NLS = NULL;         /* empty nonlinear solver structure */
+  SUNLinearSolver    LS  = NULL;         /* empty linear solver structure    */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, nst_a, netf;  /* step stats                   */
+  long int nfe, nfi;          /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+  long int nli, npsol;        /* linear solver stats          */
+  FILE*    DFID = NULL;       /* diagnostics output file      */
+  char     fname[MXSTR];
+
+  /* Additively split methods should not add the advection and reaction terms */
+  udata->add_reactions = false;
+
+  /* Create the ARK timestepper module */
+  arkode_mem = ARKStepCreate(Advection, Reaction, uopt->t0, y, udata->ctx);
+  if (check_retval((void*)arkode_mem, "ARKStepCreate", 0, udata->myid)) return 1;
+
+  /* Select the method order */
+  retval = ARKStepSetOrder(arkode_mem, uopt->order);
+  if (check_retval(&retval, "ARKStepSetOrder", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = ARKStepSetUserData(arkode_mem, (void*) udata);
+  if (check_retval(&retval, "ARKStepSetUserData", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = ARKStepSStolerances(arkode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "ARKStepSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = ARKStepSetMaxNumSteps(arkode_mem, 100000);
+  if (check_retval(&retval, "ARKStepSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Open output file for integrator diagnostics */
+  if (uopt->save)
+  {
+    sprintf(fname, "%s/diagnostics.%06d.txt", uopt->outputdir, udata->myid);
+    DFID = fopen(fname, "w");
+
+    retval = ARKStepSetDiagnostics(arkode_mem, DFID);
+    if (check_retval(&retval, "ARKStepSetDiagnostics", 1, udata->myid)) return 1;
+  }
+
+  /* Create the (non)linear solver */
+  if (uopt->nls == "newton")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_Newton(y, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+
+    /* Create linear solver */
+    LS = SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx);
+    if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1;
+
+    /* Attach linear solver */
+    retval = ARKStepSetLinearSolver(arkode_mem, LS, NULL);
+    if (check_retval(&retval, "ARKStepSetLinearSolver", 1, udata->myid)) return 1;
+
+    /* Attach preconditioner */
+    retval = ARKStepSetPreconditioner(arkode_mem, NULL, PSolve);
+    if (check_retval(&retval, "ARKStepSetPreconditioner", 1, udata->myid)) return 1;
+  }
+  else if (uopt->nls == "tl-newton")
+  {
+    /* The custom task-local nonlinear solver handles the linear solve
+       as well, so we do not need a SUNLinearSolver. */
+    NLS = TaskLocalNewton(udata->ctx, y, DFID);
+    if (check_retval((void *)NLS, "TaskLocalNewton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+  }
+  else if (uopt->nls == "fixedpoint")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+  }
+  else
+  {
+    fprintf(stderr, "\nERROR: ARK-IMEX method is not compatible with the nls option provided\n");
+    return 1;
+  }
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0) dtout /= uopt->nout;  /* nout == 0: one interval to tf */
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = ARKStepEvolve(arkode_mem, tout, y, &t, ARK_NORMAL);
+    if (check_retval(&retval, "ARKStepEvolve", 1, udata->myid)) break;
+
+    /* Output state */
+    if(uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* close output stream (DFID stays NULL if it was never opened or fopen failed) */
+  if (DFID) fclose(DFID);
+
+  /* Get final statistics */
+  retval = ARKStepGetNumSteps(arkode_mem, &nst);
+  check_retval(&retval, "ARKStepGetNumSteps", 1, udata->myid);
+  retval = ARKStepGetNumStepAttempts(arkode_mem, &nst_a);
+  check_retval(&retval, "ARKStepGetNumStepAttempts", 1, udata->myid);
+  retval = ARKStepGetNumRhsEvals(arkode_mem, &nfe, &nfi);
+  check_retval(&retval, "ARKStepGetNumRhsEvals", 1, udata->myid);
+  retval = ARKStepGetNumErrTestFails(arkode_mem, &netf);
+  check_retval(&retval, "ARKStepGetNumErrTestFails", 1, udata->myid);
+  retval = ARKStepGetNumNonlinSolvIters(arkode_mem, &nni);
+  check_retval(&retval, "ARKStepGetNumNonlinSolvIters", 1, udata->myid);
+  retval = ARKStepGetNumNonlinSolvConvFails(arkode_mem, &ncnf);
+  check_retval(&retval, "ARKStepGetNumNonlinSolvConvFails", 1, udata->myid);
+  if (uopt->nls == "newton")
+  {
+    retval = ARKStepGetNumLinIters(arkode_mem, &nli);
+    check_retval(&retval, "ARKStepGetNumLinIters", 1, udata->myid);
+    retval = ARKStepGetNumPrecSolves(arkode_mem, &npsol);
+    check_retval(&retval, "ARKStepGetNumPrecSolves", 1, udata->myid);
+  }
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li (attempted = %li)\n", nst, nst_a);
+    printf("   Total RHS evals:  Fe = %li,  Fi = %li\n", nfe, nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n", ncnf);
+    printf("   Total number of nonlinear iterations = %li\n", nni);
+    if (uopt->nls == "newton")
+    {
+      printf("   Total number of linear iterations = %li\n", nli);
+      printf("   Total number of preconditioner solves = %li\n", npsol);
+    }
+  }
+
+  /* Clean up */
+  ARKStepFree(&arkode_mem);
+  if (NLS) SUNNonlinSolFree(NLS);
+  if (LS) SUNLinSolFree(LS);
+
+  /* Return success */
+  return(0);
+}
+
+
+/* Setup ARKODE and evolve the problem in time explicitly with ERKStep and a fixed
+   step. Returns 1 if setup fails, 0 otherwise (an evolve failure only ends the loop). */
+int EvolveProblemExplicit(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*    arkode_mem = NULL; /* empty ARKODE memory structure */
+  realtype   t, dtout, tout;    /* current/output time data      */
+  int      retval;            /* reusable error-checking flag  */
+  int      iout;              /* output counter                */
+  long int nst, nst_a, netf;  /* step stats                    */
+  long int nfe;               /* RHS stats                     */
+  FILE*    DFID = NULL;       /* diagnostics output file       */
+  char     fname[MXSTR];
+
+  /* Not additively split: one RHS (AdvectionReaction) is used, so enable add_reactions */
+  udata->add_reactions = true;
+
+  /* Create the ERK timestepper module */
+  arkode_mem = ERKStepCreate(AdvectionReaction, uopt->t0, y, udata->ctx);
+  if (check_retval((void*)arkode_mem, "ERKStepCreate", 0, udata->myid)) return 1;
+
+  /* Select the method order */
+  retval = ERKStepSetOrder(arkode_mem, uopt->order);
+  if (check_retval(&retval, "ERKStepSetOrder", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = ERKStepSetUserData(arkode_mem, (void*) udata);
+  if (check_retval(&retval, "ERKStepSetUserData", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = ERKStepSStolerances(arkode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "ERKStepSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = ERKStepSetMaxNumSteps(arkode_mem, 1000000);
+  if (check_retval(&retval, "ERKStepSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Set fixed step size */
+  retval = ERKStepSetFixedStep(arkode_mem, 1e-5);
+  if (check_retval(&retval, "ERKStepSetFixedStep", 1, udata->myid)) return 1;
+
+  /* Open output file for integrator diagnostics */
+  if (uopt->save)
+  {
+    sprintf(fname, "%s/diagnostics.%06d.txt", uopt->outputdir, udata->myid);
+    DFID = fopen(fname, "w");
+
+    retval = ERKStepSetDiagnostics(arkode_mem, DFID);
+    if (check_retval(&retval, "ERKStepSetDiagnostics", 1, udata->myid)) return 1;
+  }
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0) dtout /= uopt->nout;  /* nout == 0: one interval to tf */
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = ERKStepEvolve(arkode_mem, tout, y, &t, ARK_NORMAL);
+    if (check_retval(&retval, "ERKStepEvolve", 1, udata->myid)) break;
+
+    /* Output state */
+    if(uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* close output stream (DFID stays NULL if it was never opened or fopen failed) */
+  if (DFID) fclose(DFID);
+
+  /* Get final statistics */
+  retval = ERKStepGetNumSteps(arkode_mem, &nst);
+  check_retval(&retval, "ERKStepGetNumSteps", 1, udata->myid);
+  retval = ERKStepGetNumStepAttempts(arkode_mem, &nst_a);
+  check_retval(&retval, "ERKStepGetNumStepAttempts", 1, udata->myid);
+  retval = ERKStepGetNumRhsEvals(arkode_mem, &nfe);
+  check_retval(&retval, "ERKStepGetNumRhsEvals", 1, udata->myid);
+  retval = ERKStepGetNumErrTestFails(arkode_mem, &netf);
+  check_retval(&retval, "ERKStepGetNumErrTestFails", 1, udata->myid);
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li (attempted = %li)\n", nst, nst_a);
+    printf("   Total RHS evals:  Fe = %li\n", nfe);
+    printf("   Total number of error test failures = %li\n", netf);
+  }
+
+  /* Clean up */
+  ERKStepFree(&arkode_mem);
+
+  /* Return success */
+  return(0);
+}
+
+
+/* --------------------------------------------------------------
+ * (Non)linear system functions
+ * --------------------------------------------------------------*/
+
+/* Task-local nonlinear residual for the implicit stage solve.
+ *
+ * Queries ARKStep for the current nonlinear system data, forms the stage
+ * state z = zpred + ycor on the local part of the MPIPlusX vectors,
+ * evaluates Fi = Reaction(tcur, z) and stores
+ *
+ *   F = ycor - sdata - gamma * Fi
+ *
+ * ycor and F are combined directly with the local vectors extracted from
+ * the MPIPlusX vectors.
+ *
+ * Returns 0 on success, +1 if Reaction returned a positive value, and -1 if
+ * the system data could not be retrieved, Reaction returned a negative
+ * value, or the linear combination failed. */
+int TaskLocalNlsResidual(N_Vector ycor, N_Vector F, void* arkode_mem)
+{
+  /* temporary variables */
+  UserData* udata;
+  int       retval;
+  realtype  c[3];
+  N_Vector  X[3];
+
+  /* nonlinear system data */
+  N_Vector z, zpred, Fi, sdata;
+  realtype tcur, gamma;
+  void*    user_data = NULL;
+
+  /* the outputs below are only meaningful if this call succeeds, so do not
+     touch them (or dereference user_data) when it fails */
+  retval = ARKStepGetNonlinearSystemData(arkode_mem, &tcur, &zpred, &z, &Fi,
+                                         &gamma, &sdata, &user_data);
+  if (retval < 0 || user_data == NULL) return(-1);
+  udata = (UserData*) user_data;
+
+  /* update 'z' value as stored predictor + current corrector */
+  N_VLinearSum(1.0, N_VGetLocalVector_MPIPlusX(zpred),
+               1.0, (ycor),
+               N_VGetLocalVector_MPIPlusX(z));
+
+  /* compute implicit RHS and save for later */
+  retval = Reaction(tcur,
+                    N_VGetLocalVector_MPIPlusX(z),
+                    N_VGetLocalVector_MPIPlusX(Fi),
+                    user_data);
+  udata->nnlfi++; /* count calls to Fi as part of the nonlinear residual */
+  if (retval < 0) return(-1);
+  if (retval > 0) return(+1);
+
+  /* F = 1 * ycor - 1 * sdata - gamma * Fi */
+  c[0] = 1.0;
+  X[0] = ycor;
+  c[1] = -1.0;
+  X[1] = N_VGetLocalVector_MPIPlusX(sdata);
+  c[2] = -gamma;
+  X[2] = N_VGetLocalVector_MPIPlusX(Fi);
+
+  retval = N_VLinearCombination(3, c, X, F);
+  if (retval != 0) return(-1);
+
+  return(0);
+}
+
+
+/* Task-local linear solve used by the local Newton iteration.
+ *
+ * Queries ARKStep for the current stage state z and gamma, then calls
+ * SolveReactionLinSys with delta as both the right-hand side and the
+ * solution argument, so delta is overwritten in place.
+ *
+ * Returns the value from SolveReactionLinSys, or -1 if the nonlinear system
+ * data (including the user data pointer) could not be retrieved. */
+int TaskLocalLSolve(N_Vector delta, void* arkode_mem)
+{
+  /* local variables */
+  UserData* udata = NULL;
+  int       retval;
+
+  /* nonlinear system data */
+  N_Vector z, zpred, Fi, sdata;
+  realtype tcur, gamma;
+  void*    user_data = NULL;
+
+  /* bail out before dereferencing udata if the query fails */
+  retval = ARKStepGetNonlinearSystemData(arkode_mem, &tcur, &zpred, &z, &Fi,
+                                         &gamma, &sdata, &user_data);
+  if (retval < 0 || user_data == NULL) return(-1);
+  udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set up I - gamma*J and solve */
+  retval = SolveReactionLinSys(z, delta, delta, gamma, udata);
+
+  return(retval);
+}
+
+
+/* Report the solver category. The wrapper always identifies itself as a
+ * root-finding solver; the NLS argument is not consulted. */
+SUNNonlinearSolver_Type TaskLocalNewton_GetType(SUNNonlinearSolver NLS)
+{
+  const SUNNonlinearSolver_Type nls_type = SUNNONLINEARSOLVER_ROOTFIND;
+
+  return nls_type;
+}
+
+
+/* Initialize the task-local Newton wrapper.
+ *
+ * Installs the task-local residual (TaskLocalNlsResidual) and linear solve
+ * (TaskLocalLSolve) functions on the wrapped local solver and then
+ * initializes that solver.
+ *
+ * Returns SUN_NLS_MEM_NULL if NLS is NULL, the error code of the first
+ * setter that fails, or otherwise the result of SUNNonlinSolInitialize on
+ * the local solver. */
+int TaskLocalNewton_Initialize(SUNNonlinearSolver NLS)
+{
+  int retval;
+
+  /* check that the nonlinear solver is non-null */
+  if (NLS == NULL)
+    return SUN_NLS_MEM_NULL;
+
+  /* override default system and lsolve functions with local versions; stop
+     at the first failure rather than initializing a half-configured solver */
+  retval = SUNNonlinSolSetSysFn(LOCAL_NLS(NLS), TaskLocalNlsResidual);
+  if (retval != SUN_NLS_SUCCESS) return retval;
+
+  retval = SUNNonlinSolSetLSolveFn(LOCAL_NLS(NLS), TaskLocalLSolve);
+  if (retval != SUN_NLS_SUCCESS) return retval;
+
+  return(SUNNonlinSolInitialize(LOCAL_NLS(NLS)));
+}
+
+
+/* Solve the nonlinear system independently on every task and agree on a
+ * single status across the communicator.
+ *
+ * Each task runs the wrapped local solver on the local parts of y0, ycor and
+ * w. The statuses are then reduced: if the minimum over all tasks is
+ * negative it is returned; otherwise the maximum is returned (0 when every
+ * task succeeded, > 0 when some task reported a positive code). The wrapper's
+ * convergence failure counter is incremented when that maximum equals
+ * SUN_NLS_CONV_RECVR.
+ *
+ * Returns SUN_NLS_MEM_NULL if any pointer argument is NULL. */
+int TaskLocalNewton_Solve(SUNNonlinearSolver NLS,
+                          N_Vector y0, N_Vector ycor,
+                          N_Vector w, realtype tol,
+                          booleantype callLSetup, void* mem)
+{
+  MPI_Comm comm;
+  int local_flag;  /* status of this task's solve     */
+  int global_min;  /* smallest status over all tasks  */
+  int global_max;  /* largest status over all tasks   */
+
+  /* reject NULL inputs */
+  if (!NLS || !y0 || !ycor || !w || !mem) return SUN_NLS_MEM_NULL;
+
+  comm = GET_NLS_CONTENT(NLS)->comm;
+
+  /* solve on the local portion of each vector */
+  N_Vector y0_local   = N_VGetLocalVector_MPIPlusX(y0);
+  N_Vector ycor_local = N_VGetLocalVector_MPIPlusX(ycor);
+  N_Vector w_local    = N_VGetLocalVector_MPIPlusX(w);
+
+  local_flag = SUNNonlinSolSolve(LOCAL_NLS(NLS), y0_local, ycor_local,
+                                 w_local, tol, callLSetup, mem);
+
+  /* a negative status anywhere takes precedence */
+  MPI_Allreduce(&local_flag, &global_min, 1, MPI_INT, MPI_MIN, comm);
+  if (global_min < 0)
+  {
+    return global_min;
+  }
+
+  /* otherwise report the largest (non-negative) status */
+  MPI_Allreduce(&local_flag, &global_max, 1, MPI_INT, MPI_MAX, comm);
+  if (global_max == SUN_NLS_CONV_RECVR)
+  {
+    GET_NLS_CONTENT(NLS)->ncnf++;
+  }
+
+  return global_max;
+}
+
+
+/* Release a task-local Newton wrapper: the wrapped local solver, the
+ * content structure, the ops table and finally the solver object itself.
+ * A NULL handle is accepted. Always returns SUN_NLS_SUCCESS. */
+int TaskLocalNewton_Free(SUNNonlinearSolver NLS)
+{
+  if (NLS != NULL)
+  {
+    /* content owns the local solver, so free that first */
+    if (NLS->content != NULL)
+    {
+      SUNNonlinSolFree(LOCAL_NLS(NLS));
+      free(NLS->content);
+      NLS->content = NULL;
+    }
+
+    /* operations table */
+    if (NLS->ops != NULL)
+    {
+      free(NLS->ops);
+      NLS->ops = NULL;
+    }
+
+    /* the generic solver structure */
+    free(NLS);
+  }
+
+  return SUN_NLS_SUCCESS;
+}
+
+
+/* Forward the nonlinear system function to the wrapped local solver.
+ * Returns SUN_NLS_MEM_NULL for a NULL wrapper, otherwise the result of
+ * SUNNonlinSolSetSysFn on the local solver. */
+int TaskLocalNewton_SetSysFn(SUNNonlinearSolver NLS,
+                             SUNNonlinSolSysFn SysFn)
+{
+  int flag;
+
+  if (NLS == NULL)
+  {
+    flag = SUN_NLS_MEM_NULL;
+  }
+  else
+  {
+    flag = SUNNonlinSolSetSysFn(LOCAL_NLS(NLS), SysFn);
+  }
+
+  return flag;
+}
+
+
+/* Forward the convergence test function and its data to the wrapped local
+ * solver. Returns SUN_NLS_MEM_NULL for a NULL wrapper, otherwise the result
+ * of SUNNonlinSolSetConvTestFn on the local solver. */
+int TaskLocalNewton_SetConvTestFn(SUNNonlinearSolver NLS,
+                                  SUNNonlinSolConvTestFn CTestFn,
+                                  void* ctest_data)
+{
+  int flag;
+
+  if (NLS == NULL)
+  {
+    flag = SUN_NLS_MEM_NULL;
+  }
+  else
+  {
+    flag = SUNNonlinSolSetConvTestFn(LOCAL_NLS(NLS), CTestFn, ctest_data);
+  }
+
+  return flag;
+}
+
+
+/* Copy the wrapper's convergence failure counter (maintained by
+ * TaskLocalNewton_Solve) into *nconvfails. Returns SUN_NLS_MEM_NULL for a
+ * NULL wrapper and 0 otherwise. */
+int TaskLocalNewton_GetNumConvFails(SUNNonlinearSolver NLS,
+                                    long int *nconvfails)
+{
+  if (NLS != NULL)
+  {
+    *nconvfails = GET_NLS_CONTENT(NLS)->ncnf;
+    return(0);
+  }
+
+  return SUN_NLS_MEM_NULL;
+}
+
+
+/* Construct a task-local Newton nonlinear solver.
+ *
+ * ctx  - SUNDIALS context used to create the wrapper and the local solver
+ * y    - template vector; must be a non-NULL MPIPlusX vector
+ * DFID - optional info file; when non-NULL it is attached to the local
+ *        Newton solver and the print level is set to 1
+ *
+ * The wrapper stores the communicator obtained from y, the rank and size in
+ * that communicator, a local Newton solver built from the local part of y,
+ * and a convergence failure counter starting at zero.
+ *
+ * Returns the new solver, or NULL if the input is invalid or any
+ * allocation/creation step fails (the partially built solver is freed). */
+SUNNonlinearSolver TaskLocalNewton(SUNContext ctx, N_Vector y, FILE* DFID)
+{
+  /* The template vector must exist and be an MPIPlusX vector */
+  if (y == NULL) return NULL;
+  if (N_VGetVectorID(y) != SUNDIALS_NVEC_MPIPLUSX) return NULL;
+
+  /* Start from an empty generic solver object */
+  SUNNonlinearSolver NLS = SUNNonlinSolNewEmpty(ctx);
+  if (NLS == NULL) return NULL;
+
+  /* Hook up the task-local implementations */
+  NLS->ops->gettype         = TaskLocalNewton_GetType;
+  NLS->ops->initialize      = TaskLocalNewton_Initialize;
+  NLS->ops->solve           = TaskLocalNewton_Solve;
+  NLS->ops->free            = TaskLocalNewton_Free;
+  NLS->ops->setsysfn        = TaskLocalNewton_SetSysFn;
+  NLS->ops->setctestfn      = TaskLocalNewton_SetConvTestFn;
+  NLS->ops->getnumconvfails = TaskLocalNewton_GetNumConvFails;
+
+  /* Allocate the content structure and zero every field */
+  TaskLocalNewton_Content content =
+    (TaskLocalNewton_Content) malloc(sizeof *content);
+  if (content == NULL)
+  {
+    SUNNonlinSolFree(NLS);
+    return NULL;
+  }
+  memset(content, 0, sizeof(*content));
+
+  /* From here on the free routine also releases the content */
+  NLS->content = content;
+
+  /* Communicator shared by all tasks holding y */
+  void *tmpcomm = N_VGetCommunicator(y);
+  if (tmpcomm == NULL)
+  {
+    SUNNonlinSolFree(NLS);
+    return NULL;
+  }
+
+  MPI_Comm *comm = (MPI_Comm*) tmpcomm;
+  if ((*comm) == MPI_COMM_NULL)
+  {
+    SUNNonlinSolFree(NLS);
+    return NULL;
+  }
+  content->comm = *comm;
+
+  /* Newton solver acting on the local part of the vector */
+  content->local_nls = SUNNonlinSol_Newton(N_VGetLocalVector_MPIPlusX(y), ctx);
+  if (content->local_nls == NULL)
+  {
+    SUNNonlinSolFree(NLS);
+    return NULL;
+  }
+
+  /* Record this task's position in the communicator */
+  MPI_Comm_rank(content->comm, &content->myid);
+  MPI_Comm_size(content->comm, &content->nprocs);
+
+  /* No convergence failures yet */
+  content->ncnf = 0;
+
+  /* Optional monitoring of the local solver */
+  if (DFID != NULL)
+  {
+    SUNNonlinSolSetInfoFile_Newton(LOCAL_NLS(NLS), DFID);
+    SUNNonlinSolSetPrintLevel_Newton(LOCAL_NLS(NLS), 1);
+  }
+
+  return NLS;
+}
diff --git a/benchmarks/advection_reaction_3D/backends.hpp b/benchmarks/advection_reaction_3D/raja/backends.hpp
similarity index 100%
rename from benchmarks/advection_reaction_3D/backends.hpp
rename to benchmarks/advection_reaction_3D/raja/backends.hpp
diff --git a/benchmarks/advection_reaction_3D/raja/check_retval.h b/benchmarks/advection_reaction_3D/raja/check_retval.h
new file mode 100644
index 0000000000..887b7cea5d
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/check_retval.h
@@ -0,0 +1,57 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#ifndef _SUNDIALS_CHECK_RETVAL_H_
+#define _SUNDIALS_CHECK_RETVAL_H_
+
+#include <stdio.h>
+
+/* --------------------------------------------------------------
+ * Function to check return values:
+ *
+ * opt == 0  means the function allocates memory and returns a
+ *           pointer so check if a NULL pointer was returned
+ * opt == 1  means the function returns an integer where a
+ *           value < 0 indicates an error occurred
+ * --------------------------------------------------------------*/
+/* Inspect a return value and report failures on stderr (rank 0 only).
+ *
+ * returnvalue - pointer returned by an allocating function (opt == 0) or
+ *               pointer to an int flag (opt == 1)
+ * funcname    - name of the function being checked, used in the message
+ * opt         - selects the kind of check, see above
+ * myid        - rank of the caller; only rank 0 prints
+ *
+ * Returns 1 when a failure is detected and 0 otherwise (including any opt
+ * value other than 0 or 1). */
+static int check_retval(void *returnvalue, const char *funcname, int opt, int myid)
+{
+  if (opt == 0)
+  {
+    /* pointer check: anything non-NULL is fine */
+    if (returnvalue != NULL) return(0);
+
+    if (myid == 0)
+      fprintf(stderr, "\nERROR: %s() failed - returned NULL pointer\n\n",
+              funcname);
+    return(1);
+  }
+
+  if (opt == 1)
+  {
+    /* flag check: only negative values count as failures */
+    const int errvalue = *((int *) returnvalue);
+
+    if (errvalue >= 0) return(0);
+
+    if (myid == 0)
+      fprintf(stderr, "\nERROR: %s() returned %d\n\n", funcname, errvalue);
+    return(1);
+  }
+
+  /* unrecognized opt: nothing to check */
+  return(0);
+}
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/raja/cvode_driver.cpp b/benchmarks/advection_reaction_3D/raja/cvode_driver.cpp
new file mode 100644
index 0000000000..e147ccd8c4
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/cvode_driver.cpp
@@ -0,0 +1,289 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#include "cvode/cvode.h"
+#include "sunlinsol/sunlinsol_spgmr.h"
+#include "sunnonlinsol/sunnonlinsol_newton.h"
+#include "sunnonlinsol/sunnonlinsol_fixedpoint.h"
+#include "advection_reaction_3D.hpp"
+#include "rhs3D.hpp"
+
+
+/* Setup CVODE and evolve problem in time with BDF method */
+/* Setup CVODE and evolve problem in time with BDF method.
+ *
+ * y     - solution vector; passed to CVodeInit as the initial condition and
+ *         advanced in place by CVode
+ * udata - problem data (SUNDIALS context, MPI rank, RHS evaluation counter)
+ * uopt  - run options (time interval, tolerances, nonlinear solver choice,
+ *         preconditioning flag, number of outputs)
+ *
+ * The nonlinear solver is chosen by uopt->nls: "newton" (with SPGMR and the
+ * PSolve preconditioner routine attached) or "fixedpoint".
+ *
+ * Returns 1 if any setup call fails and 0 otherwise. A failure inside the
+ * time loop is reported by check_retval and ends the loop, but statistics
+ * are still printed and 0 is returned. */
+int EvolveProblemBDF(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              cvode_mem = NULL;   /* empty CVODE memory structure    */
+  SUNNonlinearSolver NLS = NULL;         /* empty nonlinear solver structure */
+  SUNLinearSolver    LS  = NULL;         /* empty linear solver structure    */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, netf;         /* step stats                   */
+  long int nfi;               /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+  long int nli, npsol;        /* linear solver stats          */
+
+  /* CVODE integrates the combined right-hand side (AdvectionReaction);
+     presumably this flag makes it include the reaction terms -- confirm
+     against the RHS implementation */
+  udata->add_reactions = true;
+
+  /* Create CVode */
+  cvode_mem = CVodeCreate(CV_BDF, udata->ctx);
+  if (check_retval((void*)cvode_mem, "CVodeCreate", 0, udata->myid)) return 1;
+
+  /* Initialize CVode; the status is the integer flag, not the memory
+     pointer, so check it with opt == 1 */
+  retval = CVodeInit(cvode_mem, AdvectionReaction, uopt->t0, y);
+  if (check_retval(&retval, "CVodeInit", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = CVodeSetUserData(cvode_mem, (void*) udata);
+  if (check_retval(&retval, "CVodeSetUserData", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = CVodeSStolerances(cvode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "CVodeSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = CVodeSetMaxNumSteps(cvode_mem, 100000);
+  if (check_retval(&retval, "CVodeSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Create the (non)linear solver */
+  if (uopt->nls == "newton")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_Newton(y, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = CVodeSetNonlinearSolver(cvode_mem, NLS);
+    if (check_retval(&retval, "CVodeSetNonlinearSolver", 1, udata->myid)) return 1;
+
+    /* Create linear solver; preconditioning side depends on the run options */
+    LS = SUNLinSol_SPGMR(y, uopt->precond ? PREC_LEFT : PREC_NONE, 0,
+                         udata->ctx);
+    if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1;
+
+    /* Attach linear solver */
+    retval = CVodeSetLinearSolver(cvode_mem, LS, NULL);
+    if (check_retval(&retval, "CVodeSetLinearSolver", 1, udata->myid)) return 1;
+
+    /* Attach preconditioner (solve routine only, no setup routine) */
+    retval = CVodeSetPreconditioner(cvode_mem, NULL, PSolve);
+    if (check_retval(&retval, "CVodeSetPreconditioner", 1, udata->myid)) return 1;
+  }
+  else if (uopt->nls == "fixedpoint")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = CVodeSetNonlinearSolver(cvode_mem, NLS);
+    if (check_retval(&retval, "CVodeSetNonlinearSolver", 1, udata->myid)) return 1;
+  }
+  else
+  {
+    fprintf(stderr, "\nERROR: CV-BDF method is not compatible with the nls option provided\n");
+    return 1;
+  }
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time; with nout == 0 the whole interval is a single
+     output step and the do-while below still runs once */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0)
+    dtout /= uopt->nout;
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL);
+    if (check_retval(&retval, "CVode", 1, udata->myid)) break;
+
+    /* Output state */
+    if (uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time, never stepping past tf */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* Get final statistics */
+  retval = CVodeGetNumSteps(cvode_mem, &nst);
+  check_retval(&retval, "CVodeGetNumSteps", 1, udata->myid);
+  retval = CVodeGetNumRhsEvals(cvode_mem, &nfi);
+  check_retval(&retval, "CVodeGetNumRhsEvals", 1, udata->myid);
+  retval = CVodeGetNumErrTestFails(cvode_mem, &netf);
+  check_retval(&retval, "CVodeGetNumErrTestFails", 1, udata->myid);
+  retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni);
+  check_retval(&retval, "CVodeGetNumNonlinSolvIters", 1, udata->myid);
+  retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncnf);
+  check_retval(&retval, "CVodeGetNumNonlinSolvConvFails", 1, udata->myid);
+  if (uopt->nls == "newton")
+  {
+    retval = CVodeGetNumLinIters(cvode_mem, &nli);
+    check_retval(&retval, "CVodeGetNumLinIters", 1, udata->myid);
+    retval = CVodeGetNumPrecSolves(cvode_mem, &npsol);
+    check_retval(&retval, "CVodeGetNumPrecSolves", 1, udata->myid);
+  }
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li\n", nst);
+    printf("   Total RHS evals: %li\n", nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n",
+           ncnf);
+    printf("   Total number of nonlinear iterations = %li\n", nni);
+    if (uopt->nls == "newton")
+    {
+      printf("   Total number of linear iterations = %li\n", nli);
+      printf("   Total number of preconditioner solves = %li\n", npsol);
+    }
+  }
+
+  /* Clean up */
+  CVodeFree(&cvode_mem);
+  if (NLS) SUNNonlinSolFree(NLS);
+  if (LS)  SUNLinSolFree(LS);
+
+  /* Return success */
+  return(0);
+}
+
+
+/* Setup CVODE and evolve problem in time with Adams method */
+/* Setup CVODE and evolve problem in time with Adams method.
+ *
+ * y     - solution vector; passed to CVodeInit as the initial condition and
+ *         advanced in place by CVode
+ * udata - problem data (SUNDIALS context, MPI rank, RHS evaluation counter)
+ * uopt  - run options (time interval, tolerances, fixed-point acceleration
+ *         setting, number of outputs)
+ *
+ * A fixed-point nonlinear solver is always used.
+ *
+ * Returns 1 if any setup call fails and 0 otherwise. A failure inside the
+ * time loop is reported by check_retval and ends the loop, but statistics
+ * are still printed and 0 is returned. */
+int EvolveProblemAdams(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              cvode_mem = NULL;   /* empty CVODE memory structure    */
+  SUNNonlinearSolver NLS = NULL;         /* empty nonlinear solver structure */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, netf;         /* step stats                   */
+  long int nfi;               /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+
+  /* CVODE integrates the combined right-hand side (AdvectionReaction);
+     presumably this flag makes it include the reaction terms -- confirm
+     against the RHS implementation */
+  udata->add_reactions = true;
+
+  /* Create CVode */
+  cvode_mem = CVodeCreate(CV_ADAMS, udata->ctx);
+  if (check_retval((void*)cvode_mem, "CVodeCreate", 0, udata->myid)) return 1;
+
+  /* Initialize CVode; the status is the integer flag, not the memory
+     pointer, so check it with opt == 1 */
+  retval = CVodeInit(cvode_mem, AdvectionReaction, uopt->t0, y);
+  if (check_retval(&retval, "CVodeInit", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = CVodeSetUserData(cvode_mem, (void*) udata);
+  if (check_retval(&retval, "CVodeSetUserData", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = CVodeSStolerances(cvode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "CVodeSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = CVodeSetMaxNumSteps(cvode_mem, 100000);
+  if (check_retval(&retval, "CVodeSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Create nonlinear solver */
+  NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx);
+  if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1;
+
+  /* Attach nonlinear solver */
+  retval = CVodeSetNonlinearSolver(cvode_mem, NLS);
+  if (check_retval(&retval, "CVodeSetNonlinearSolver", 1, udata->myid)) return 1;
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time; with nout == 0 the whole interval is a single
+     output step and the do-while below still runs once */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0)
+    dtout /= uopt->nout;
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL);
+    if (check_retval(&retval, "CVode", 1, udata->myid)) break;
+
+    /* Output state */
+    if (uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time, never stepping past tf */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* Get final statistics */
+  retval = CVodeGetNumSteps(cvode_mem, &nst);
+  check_retval(&retval, "CVodeGetNumSteps", 1, udata->myid);
+  retval = CVodeGetNumRhsEvals(cvode_mem, &nfi);
+  check_retval(&retval, "CVodeGetNumRhsEvals", 1, udata->myid);
+  retval = CVodeGetNumErrTestFails(cvode_mem, &netf);
+  check_retval(&retval, "CVodeGetNumErrTestFails", 1, udata->myid);
+  retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni);
+  check_retval(&retval, "CVodeGetNumNonlinSolvIters", 1, udata->myid);
+  retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncnf);
+  check_retval(&retval, "CVodeGetNumNonlinSolvConvFails", 1, udata->myid);
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li\n", nst);
+    printf("   Total RHS evals: %li\n", nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n",
+           ncnf);
+  }
+
+  /* Clean up */
+  CVodeFree(&cvode_mem);
+  SUNNonlinSolFree(NLS);
+
+  /* Return success */
+  return(0);
+}
diff --git a/benchmarks/advection_reaction_3D/raja/ida_driver.cpp b/benchmarks/advection_reaction_3D/raja/ida_driver.cpp
new file mode 100644
index 0000000000..3ae28a43ca
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/ida_driver.cpp
@@ -0,0 +1,195 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#include "ida/ida.h"
+#include "sunlinsol/sunlinsol_spgmr.h"
+#include "sunnonlinsol/sunnonlinsol_newton.h"
+#include "sunnonlinsol/sunnonlinsol_fixedpoint.h"
+#include "advection_reaction_3D.hpp"
+#include "rhs3D.hpp"
+
+
+/* Initial condition function */
+/* Fill yp with the initial time derivative by evaluating AdvectionReaction
+ * at time 0 for the state y. Returns 1 if check_retval flags the evaluation
+ * as failed (negative return) and 0 otherwise. */
+int SetICDot(N_Vector y, N_Vector yp, UserData* udata)
+{
+  int rhs_flag = AdvectionReaction(0, y, yp, (void*)udata);
+
+  /* check_retval yields 1 on failure, 0 on success */
+  return check_retval(&rhs_flag, "AdvectionReaction", 1, udata->myid) ? 1 : 0;
+}
+
+
+/* Setup IDA and evolve problem in time with BDF method */
+/* Setup IDA and evolve problem in time with BDF method.
+ *
+ * y     - solution vector; passed to IDAInit as the initial condition and
+ *         advanced in place by IDASolve
+ * udata - problem data (SUNDIALS context, MPI rank, RHS evaluation counter)
+ * uopt  - run options (time interval, tolerances, nonlinear solver choice,
+ *         preconditioning flag, number of outputs)
+ *
+ * Only the "newton" nonlinear solver option is accepted; it is paired with
+ * SPGMR and the PSolveRes preconditioner routine.
+ *
+ * Returns 1 if any setup call fails and 0 otherwise. A failure inside the
+ * time loop is reported by check_retval and ends the loop, but statistics
+ * are still printed and 0 is returned. */
+int EvolveDAEProblem(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              ida_mem = NULL;  /* empty IDA memory structure       */
+  SUNNonlinearSolver NLS = NULL;      /* empty nonlinear solver structure */
+  SUNLinearSolver    LS  = NULL;      /* empty linear solver structure    */
+  N_Vector           yp  = NULL;      /* empty vector structure           */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, netf;         /* step stats                   */
+  long int nfi;               /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+  long int nli, npsol;        /* linear solver stats          */
+
+  /* IDA works with the combined problem; presumably this flag makes the
+     RHS/residual include the reaction terms -- confirm against the RHS
+     implementation */
+  udata->add_reactions = true;
+
+  /* Create ydot' vector */
+  yp = N_VClone(y);
+  if (check_retval((void*)yp, "N_VClone", 0, udata->myid)) return 1;
+
+  /* Set ydot' initial condition. This is done before IDAInit so that the
+     derivative handed to IDA is already populated. SetICDot returns +1 on
+     failure (and has already printed the error), so test for any nonzero
+     value rather than only negative ones. */
+  retval = SetICDot(y, yp, udata);
+  if (retval != 0) return 1;
+
+  /* Create IDA */
+  ida_mem = IDACreate(udata->ctx);
+  if (check_retval((void*)ida_mem, "IDACreate", 0, udata->myid)) return 1;
+
+  /* Initialize IDA */
+  retval = IDAInit(ida_mem, AdvectionReactionResidual, uopt->t0, y, yp);
+  if (check_retval(&retval, "IDAInit", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = IDASetUserData(ida_mem, (void*) udata);
+  if (check_retval(&retval, "IDASetUserData", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = IDASStolerances(ida_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "IDASStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = IDASetMaxNumSteps(ida_mem, 100000);
+  if (check_retval(&retval, "IDASetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Increase the max number of ETF allowed between outputs */
+  retval = IDASetMaxErrTestFails(ida_mem, 25);
+  if (check_retval(&retval, "IDASetMaxErrTestFails", 1, udata->myid)) return 1;
+
+  /* Create the (non)linear solver */
+  if (uopt->nls == "newton")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_Newton(y, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = IDASetNonlinearSolver(ida_mem, NLS);
+    if (check_retval(&retval, "IDASetNonlinearSolver", 1, udata->myid)) return 1;
+
+    /* Create linear solver; preconditioning side depends on the run options */
+    LS = SUNLinSol_SPGMR(y, uopt->precond ? PREC_LEFT : PREC_NONE, 0,
+                         udata->ctx);
+    if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1;
+
+    /* Attach linear solver */
+    retval = IDASetLinearSolver(ida_mem, LS, NULL);
+    if (check_retval(&retval, "IDASetLinearSolver", 1, udata->myid)) return 1;
+
+    /* Attach preconditioner (solve routine only, no setup routine) */
+    retval = IDASetPreconditioner(ida_mem, NULL, PSolveRes);
+    if (check_retval(&retval, "IDASetPreconditioner", 1, udata->myid)) return 1;
+  }
+  else
+  {
+    fprintf(stderr, "\nERROR: IDA method is not compatible with the nls option provided\n");
+    return 1;
+  }
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time; with nout == 0 the whole interval is a single
+     output step and the do-while below still runs once */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0)
+    dtout /= uopt->nout;
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = IDASolve(ida_mem, tout, &t, y, yp, IDA_NORMAL);
+    if (check_retval(&retval, "IDASolve", 1, udata->myid)) break;
+
+    /* Output state */
+    if (uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time, never stepping past tf */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* Get final statistics */
+  retval = IDAGetNumSteps(ida_mem, &nst);
+  check_retval(&retval, "IDAGetNumSteps", 1, udata->myid);
+  retval = IDAGetNumResEvals(ida_mem, &nfi);
+  check_retval(&retval, "IDAGetNumResEvals", 1, udata->myid);
+  retval = IDAGetNumErrTestFails(ida_mem, &netf);
+  check_retval(&retval, "IDAGetNumErrTestFails", 1, udata->myid);
+  retval = IDAGetNumNonlinSolvIters(ida_mem, &nni);
+  check_retval(&retval, "IDAGetNumNonlinSolvIters", 1, udata->myid);
+  retval = IDAGetNumNonlinSolvConvFails(ida_mem, &ncnf);
+  check_retval(&retval, "IDAGetNumNonlinSolvConvFails", 1, udata->myid);
+  if (uopt->nls == "newton")
+  {
+    retval = IDAGetNumLinIters(ida_mem, &nli);
+    check_retval(&retval, "IDAGetNumLinIters", 1, udata->myid);
+    retval = IDAGetNumPrecSolves(ida_mem, &npsol);
+    check_retval(&retval, "IDAGetNumPrecSolves", 1, udata->myid);
+  }
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li\n", nst);
+    printf("   Total RHS evals: %li\n", nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n",
+           ncnf);
+    printf("   Total number of nonlinear iterations = %li\n", nni);
+    if (uopt->nls == "newton")
+    {
+      printf("   Total number of linear iterations = %li\n", nli);
+      printf("   Total number of preconditioner solves = %li\n", npsol);
+    }
+  }
+
+  /* Clean up */
+  IDAFree(&ida_mem);
+  if (yp) N_VDestroy(yp);
+  if (NLS) SUNNonlinSolFree(NLS);
+  if (LS)  SUNLinSolFree(LS);
+
+  /* Return success */
+  return(0);
+}
diff --git a/benchmarks/advection_reaction_3D/raja/rhs3D.hpp b/benchmarks/advection_reaction_3D/raja/rhs3D.hpp
new file mode 100644
index 0000000000..1bb2b6f105
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/rhs3D.hpp
@@ -0,0 +1,598 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
+ *                Daniel R. Reynolds @ SMU
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------*/
+
+#ifndef ADVECTION_REACTION_3D_RHS_HPP
+#define ADVECTION_REACTION_3D_RHS_HPP
+
+#include "advection_reaction_3D.hpp"
+
+/* --------------------------------------------------------------
+ * Right hand side (RHS) and residual functions
+ * --------------------------------------------------------------*/
+
+/* Compute the advection term f(t,y) = -c (grad * y) using 1st order upwind
+   finite differences; t is unused. Returns 0 on success, -1 on failure. */
+static int Advection(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  /* access problem data */
+  UserData* udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set variable shortcuts (cx, cy, cz carry the sign: -c / grid spacing) */
+  const int      nxl = udata->grid->nxl;
+  const int      nyl = udata->grid->nyl;
+  const int      nzl = udata->grid->nzl;
+  const int      dof = udata->grid->dof;
+  const realtype c   = udata->c;
+  const realtype cx  = -c / udata->grid->dx;
+  const realtype cy  = -c / udata->grid->dy;
+  const realtype cz  = -c / udata->grid->dz;
+
+  /* local variables */
+  int retval;
+
+  /* fill send buffers and begin exchanging boundary information */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = FillSendBuffers(y, udata);
+  if (check_retval(&retval, "FillSendBuffers", 1, udata->myid))
+    return(-1); /* NOTE(review): exits without the matching SUNDIALS_MARK_END */
+  retval = udata->grid->ExchangeStart();
+  if (check_retval(&retval, "ExchangeStart", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+  /* set output to zero (this is the final result when c == 0) */
+  N_VConst(0.0, ydot);
+
+  /* create views of the state and RHS vectors, indexed as (i, j, k, component) */
+  RAJA::View<realtype, RAJA::Layout<4> > Yview(GetVecData(y), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4> > dYview(GetVecData(ydot), nxl, nyl, nzl, dof);
+
+  /* iterate over cells whose upwind neighbors are all local, computing advection */
+  if (c > 0.0)
+  {
+    /* flow moving in the positive x,y,z direction */
+    auto range = RAJA::make_tuple(RAJA::RangeSegment(1, nxl),
+                                  RAJA::RangeSegment(1, nyl),
+                                  RAJA::RangeSegment(1, nzl));
+    RAJA::kernel<XYZ_KERNEL_POL>(range,
+      [=] DEVICE_FUNC (int i, int j, int k) {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0) =  cz * (u_ijk - Yview(i,j,k-1,0)); // du/dz
+      dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j-1,k,0)); // du/dy
+      dYview(i,j,k,0) += cx * (u_ijk - Yview(i-1,j,k,0)); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1) =  cz * (v_ijk - Yview(i,j,k-1,1)); // dv/dz
+      dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j-1,k,1)); // dv/dy
+      dYview(i,j,k,1) += cx * (v_ijk - Yview(i-1,j,k,1)); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2) =  cz * (w_ijk - Yview(i,j,k-1,2)); // dw/dz
+      dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j-1,k,2)); // dw/dy
+      dYview(i,j,k,2) += cx * (w_ijk - Yview(i-1,j,k,2)); // dw/dx
+    });
+
+  }
+  else if (c < 0.0)
+  {
+    /* flow moving in the negative x,y,z direction */
+    auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1),
+                                  RAJA::RangeSegment(0, nyl-1),
+                                  RAJA::RangeSegment(0, nzl-1));
+    RAJA::kernel<XYZ_KERNEL_POL>(range,
+      [=] DEVICE_FUNC (int i, int j, int k) {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0) =  cz * (Yview(i,j,k+1,0) - u_ijk); // du/dz
+      dYview(i,j,k,0) += cy * (Yview(i,j+1,k,0) - u_ijk); // du/dy
+      dYview(i,j,k,0) += cx * (Yview(i+1,j,k,0) - u_ijk); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1) =  cz * (Yview(i,j,k+1,1) - v_ijk); // dv/dz
+      dYview(i,j,k,1) += cy * (Yview(i,j+1,k,1) - v_ijk); // dv/dy
+      dYview(i,j,k,1) += cx * (Yview(i+1,j,k,1) - v_ijk); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2) =  cz * (Yview(i,j,k+1,2) - w_ijk); // dw/dz
+      dYview(i,j,k,2) += cy * (Yview(i,j+1,k,2) - w_ijk); // dw/dy
+      dYview(i,j,k,2) += cx * (Yview(i+1,j,k,2) - w_ijk); // dw/dx
+    });
+
+  }
+
+  /* finish exchanging boundary information (receive buffers are read below) */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = udata->grid->ExchangeEnd();
+  if (check_retval(&retval, "ExchangeEnd", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+
+  /* compute advection on the faces skipped by the interior kernels above */
+  if (c > 0.0)
+  {
+    /* Flow moving in the positive x,y,z direction:
+    *  boundaries are west (i=0), south (j=0), and back (k=0) faces */
+
+    /* Each "lower" face: non-local upwind values come from the receive buffers */
+    RAJA::View<realtype, RAJA::Layout<3>>
+      Wrecv(udata->grid->getRecvBuffer("WEST"),  nyl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3>>
+      Srecv(udata->grid->getRecvBuffer("SOUTH"), nxl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3>>
+      Brecv(udata->grid->getRecvBuffer("BACK"),  nxl, nyl, dof);
+
+    auto west_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, nzl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(west_face,
+      [=] DEVICE_FUNC (int j, int k, int l) {
+      const int i = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,k,l);
+      const realtype YBack  = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - Wrecv(j,k,l)); // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth);       // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack);        // d/dz
+    });
+
+    auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nzl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(south_face,
+      [=] DEVICE_FUNC (int i, int k, int l) {
+      const int j = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YWest = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(j,k,l);
+      const realtype YBack = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - YWest);        // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - Srecv(i,k,l)); // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack);        // d/dz
+    });
+
+    auto back_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                      RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(back_face,
+      [=] DEVICE_FUNC (int i, int j, int l) {
+      const int k = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YWest  = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,k,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - YWest);        // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth);       // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - Brecv(i,j,l)); // d/dz
+    });
+
+  }
+  else if (c < 0.0)
+  {
+
+    /* Flow moving in the negative x,y,z direction: boundaries are the
+    *  east (i=nxl-1), north (j=nyl-1), and front (k=nzl-1) faces */
+
+    /* Each "upper" face: non-local upwind values come from the receive buffers */
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Erecv(udata->grid->getRecvBuffer("EAST"),  nyl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Nrecv(udata->grid->getRecvBuffer("NORTH"), nxl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Frecv(udata->grid->getRecvBuffer("FRONT"), nxl, nyl, dof);
+
+    auto east_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, nzl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(east_face,
+      [=] DEVICE_FUNC (int j, int k, int l) {
+      const int i = nxl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,l);
+      dYview(i,j,k,l)  = cx * (Erecv(j,k,l) - Yijkl); // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl);       // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl);       // d/dz
+    });
+
+    auto north_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nzl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(north_face,
+      [=] DEVICE_FUNC (int i, int k, int l) {
+      const int j = nyl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast  = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(j,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,l);
+      dYview(i,j,k,l)  = cx * (YEast - Yijkl);        // d/dx
+      dYview(i,j,k,l) += cy * (Nrecv(i,k,l) - Yijkl); // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl);       // d/dz
+    });
+
+    auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nyl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(front_face,
+      [=] DEVICE_FUNC (int i, int j, int l) {
+      const int k = nzl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast  = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,k,l);
+      dYview(i,j,k,l)  = cx * (YEast - Yijkl);        // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl);       // d/dy
+      dYview(i,j,k,l) += cz * (Frecv(i,j,l) - Yijkl); // d/dz
+    });
+  }
+
+  /* return success */
+  return(0);
+}
+
+
+/* Compute the reaction term g(t,y) at every local grid point; t is unused. */
+static int Reaction(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  /* access problem data */
+  UserData* udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set variable shortcuts */
+  const realtype A  = udata->A;
+  const realtype B  = udata->B;
+  const realtype k1 = udata->k1;
+  const realtype k2 = udata->k2;
+  const realtype k3 = udata->k3;
+  const realtype k4 = udata->k4;
+  const realtype k5 = udata->k5;
+  const realtype k6 = udata->k6;
+  const int     nxl = udata->grid->nxl;
+  const int     nyl = udata->grid->nyl;
+  const int     nzl = udata->grid->nzl;
+  const int     dof = udata->grid->dof;
+
+  /* Zero output if not adding reactions to existing RHS; otherwise the
+     kernel below accumulates g(t,y) onto the values already in ydot */
+  if (!udata->add_reactions)
+    N_VConst(0.0, ydot);
+
+  /* access data arrays once; the pointers checked here are the ones the
+     views below are built on (no second, unchecked GetVecData call) */
+  realtype* Ydata = GetVecData(y);
+  if (check_retval((void *)Ydata, "GetVecData", 0, udata->myid))
+    return(-1);
+  realtype* dYdata = GetVecData(ydot);
+  if (check_retval((void *)dYdata, "GetVecData", 0, udata->myid))
+    return(-1);
+
+  /* create 4D views of state and RHS vectors, indexed as (i, j, k, component) */
+  RAJA::View<realtype, RAJA::Layout<4> > Yview(Ydata, nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4> > dYview(dYdata, nxl, nyl, nzl, dof);
+
+  /* add reaction terms to RHS */
+  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                RAJA::RangeSegment(0, nyl),
+                                RAJA::RangeSegment(0, nzl));
+  RAJA::kernel<XYZ_KERNEL_POL>(range,
+    [=] DEVICE_FUNC (int i, int j, int k) {
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+    dYview(i,j,k,0) += k1 * A - k2 * w * u + k3 * u * u * v - k4 * u;
+    dYview(i,j,k,1) += k2 * w * u - k3 * u * u * v;
+    dYview(i,j,k,2) += -k2 * w * u + k5 * B - k6 * w;
+  });
+
+  /* return success */
+  return(0);
+}
+
+
+/* Compute the full RHS h(t,y) = f(t,y) + g(t,y) (advection plus reaction). */
+static int AdvectionReaction(realtype t, N_Vector y, N_Vector ydot,
+                             void *user_data)
+{
+  /* the problem data is only consulted for the rank id used in error reports */
+  UserData* const udata = static_cast<UserData*>(user_data);
+
+  /* Advection must be evaluated first: it overwrites ydot, and Reaction is
+     called afterwards on that same vector. */
+  int flag = Advection(t, y, ydot, user_data);
+  if (check_retval(&flag, "Advection", 1, udata->myid))
+    return -1;
+
+  flag = Reaction(t, y, ydot, user_data);
+  if (check_retval(&flag, "Reaction", 1, udata->myid))
+    return -1;
+
+  return 0;
+}
+
+/* Compute the residual F(t,y,y') = ydot - h(t,y), h being advection + reaction. */
+static int AdvectionReactionResidual(realtype t, N_Vector y, N_Vector ydot,
+                                     N_Vector F, void *user_data)
+{
+  UserData* const udata = static_cast<UserData*>(user_data);
+
+  /* Advection must be evaluated first: it overwrites F, and Reaction is
+     called afterwards on that same vector. */
+  int flag = Advection(t, y, F, user_data);   /* F = -c y_x */
+  if (check_retval(&flag, "Advection", 1, udata->myid))
+    return -1;
+
+  /* F = -c y_x + g(t,y) */
+  flag = Reaction(t, y, F, user_data);
+  if (check_retval(&flag, "Reaction", 1, udata->myid))
+    return -1;
+
+  /* flip the sign and add ydot: F = ydot - h(t,y) = ydot + c y_x - g(t,y) */
+  N_VLinearSum(1.0, ydot, -1.0, F, F);
+
+  return 0;
+}
+
+/* --------------------------------------------------------------
+ * Linear system and Jacobian functions
+ * --------------------------------------------------------------*/
+
+/* Solve the 3x3 block systems A x = b at every local grid point, where
+   A = I - gamma*dg/dy is evaluated at y. For a fully implicit method this
+   approximates dh/dy by dg/dy. Writes the solution to x; always returns 0. */
+static int SolveReactionLinSys(N_Vector y, N_Vector x, N_Vector b,
+                               realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2 = udata->k2;
+  const realtype k3 = udata->k3;
+  const realtype k4 = udata->k4;
+  const realtype k6 = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  RAJA::View<realtype, RAJA::Layout<4>> Yview(GetVecData(y), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4>> Bview(GetVecData(b), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4>> Xview(GetVecData(x), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system: one independent 3x3 solve per (i,j,k) */
+  auto blocks = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                 RAJA::RangeSegment(0, nyl),
+                                 RAJA::RangeSegment(0, nzl));
+  RAJA::kernel<XYZ_KERNEL_POL>(blocks,
+    [=] DEVICE_FUNC (int i, int j, int k) {
+
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    //
+    // compute J = dg/dy, stored row-major in A0..A8
+    //
+
+    /* 1st row: u, v, w */
+    realtype A0 = -k2 * w + 2.0 * k3 * u * v - k4;
+    realtype A1 =  k3 * u * u;
+    realtype A2 = -k2 * u;
+
+    /* 2nd row: u, v, w */
+    realtype A3 =  k2 * w - 2.0 * k3 * u * v;
+    realtype A4 = -k3 * u * u;
+    realtype A5 =  k2 * u;
+
+    /* 3rd row: u, v, w */
+    realtype A6 = -k2 * w;
+    realtype A7 =  0.0;
+    realtype A8 = -k2 * u - k6;
+
+    //
+    // compute A = I - gamma*J (in place, overwriting J)
+    //
+
+    A0 = 1. - (gamma * A0);
+    A1 = -gamma * A1;
+    A2 = -gamma * A2;
+    A3 = -gamma * A3;
+    A4 = 1. - (gamma * A4);
+    A5 = -gamma * A5;
+    A6 = -gamma * A6;
+    A7 = -gamma * A7;
+    A8 = 1. - (gamma * A8);
+
+    //
+    // compute x = A^{-1}b in closed form (no pivoting or singularity checks)
+    //
+
+    realtype scratch_0 = A4*A8;
+    realtype scratch_1 = A1*A5;
+    realtype scratch_2 = A2*A7;
+    realtype scratch_3 = A5*A7;
+    realtype scratch_4 = A1*A8;
+    realtype scratch_5 = A2*A4;
+    realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5); // 1/det(A)
+    realtype scratch_7 = A2*A3;
+    realtype scratch_8 = A6*Bview(i,j,k,0);
+    realtype scratch_9 = A2*A6;
+    realtype scratch_10 = A3*Bview(i,j,k,0);
+    realtype scratch_11 = 1.0/A0;
+    realtype scratch_12 = A1*scratch_11;
+    realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
+
+    Xview(i,j,k,0) = scratch_6*( Bview(i,j,k,0)*(scratch_0 - scratch_3)
+                               + Bview(i,j,k,1)*(scratch_2 - scratch_4)
+                               + Bview(i,j,k,2)*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( Bview(i,j,k,2)*(scratch_7 - A0*A5)
+                               + Bview(i,j,k,1)*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    Xview(i,j,k,2) = ( -Bview(i,j,k,2) + scratch_11*scratch_8
+                     + scratch_13*(Bview(i,j,k,1) - scratch_10*scratch_11)) /
+                     (-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
+
+  });
+
+  return(0);
+}
+
+/* Solve the 3x3 block systems A x = b at every local grid point, where
+   A = -dg/dy + gamma*I and dh/dy is approximated by dg/dy. Always returns 0. */
+static int SolveReactionLinSysRes(N_Vector y, N_Vector x, N_Vector b,
+                                  realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2  = udata->k2;
+  const realtype k3  = udata->k3;
+  const realtype k4  = udata->k4;
+  const realtype k6  = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  RAJA::View<realtype, RAJA::Layout<4>> Yview(GetVecData(y), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4>> Bview(GetVecData(b), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4>> Xview(GetVecData(x), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system: one independent 3x3 solve per (i,j,k) */
+  auto blocks = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                 RAJA::RangeSegment(0, nyl),
+                                 RAJA::RangeSegment(0, nzl));
+  RAJA::kernel<XYZ_KERNEL_POL>(blocks,
+    [=] DEVICE_FUNC (int i, int j, int k) {
+
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    //
+    // compute dg/dy, stored row-major in A0..A8
+    //
+
+    /* 1st row: u, v, w */
+    realtype A0 = -k2 * w + 2.0 * k3 * u * v - k4;
+    realtype A1 =  k3 * u * u;
+    realtype A2 = -k2 * u;
+
+    /* 2nd row: u, v, w */
+    realtype A3 =  k2 * w - 2.0 * k3 * u * v;
+    realtype A4 = -k3 * u * u;
+    realtype A5 =  k2 * u;
+
+    /* 3rd row: u, v, w */
+    realtype A6 = -k2 * w;
+    realtype A7 =  0.0;
+    realtype A8 = -k2 * u - k6;
+
+    //
+    // compute A = -dg/dy + gamma*I: the sign of every entry of dg/dy is
+    // flipped and gamma is added on the diagonal (A0, A4, A8) only
+    // NOTE(review): presumably I = dF/dydot for F = ydot - h(t,y) -- confirm
+    //
+
+    A0 = -A0 + gamma;
+    A1 = -A1;
+    A2 = -A2;
+    A3 = -A3;
+    A4 = -A4 + gamma;
+    A5 = -A5;
+    A6 = -A6;
+    A7 = -A7;
+    A8 = -A8 + gamma;
+
+    //
+    // compute x = A^{-1}b in closed form (no pivoting or singularity checks)
+    //
+
+    realtype scratch_0 = A4*A8;
+    realtype scratch_1 = A1*A5;
+    realtype scratch_2 = A2*A7;
+    realtype scratch_3 = A5*A7;
+    realtype scratch_4 = A1*A8;
+    realtype scratch_5 = A2*A4;
+    realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5); // 1/det(A)
+    realtype scratch_7 = A2*A3;
+    realtype scratch_8 = A6*Bview(i,j,k,0);
+    realtype scratch_9 = A2*A6;
+    realtype scratch_10 = A3*Bview(i,j,k,0);
+    realtype scratch_11 = 1.0/A0;
+    realtype scratch_12 = A1*scratch_11;
+    realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
+
+    Xview(i,j,k,0) = scratch_6*( Bview(i,j,k,0)*(scratch_0 - scratch_3)
+                               + Bview(i,j,k,1)*(scratch_2 - scratch_4)
+                               + Bview(i,j,k,2)*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( Bview(i,j,k,2)*(scratch_7 - A0*A5)
+                               + Bview(i,j,k,1)*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    Xview(i,j,k,2) = ( -Bview(i,j,k,2) + scratch_11*scratch_8
+                     + scratch_13*(Bview(i,j,k,1) - scratch_10*scratch_11)) /
+                     (-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
+
+  });
+
+  return(0);
+}
+
+
+/* --------------------------------------------------------------
+ * Preconditioner functions
+ * --------------------------------------------------------------*/
+
+/* Preconditioner solve: find z such that Pz = r with P = I - gamma * dg/dy.
+   The arguments t, ydot, delta and lr are not used. */
+static int PSolve(realtype t, N_Vector y, N_Vector ydot, N_Vector r,
+                  N_Vector z, realtype gamma, realtype delta, int lr,
+                  void *user_data)
+{
+  UserData* const udata = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* the task-local solve writes z directly; its status is handed back to
+     the caller unchanged */
+  const int status = SolveReactionLinSys(y, z, r, gamma, udata);
+
+  return status;
+}
+
+/* Preconditioner solve: find z such that Pz = r with P = -dg/dy + cj.
+   The arguments t, ydot, F and delta are not used. */
+static int PSolveRes(realtype t, N_Vector y, N_Vector ydot, N_Vector F,
+                     N_Vector r, N_Vector z, realtype cj, realtype delta,
+                     void *user_data)
+{
+  UserData* const udata = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* the task-local solve writes z directly; its status is handed back to
+     the caller unchanged */
+  const int status = SolveReactionLinSysRes(y, z, r, cj, udata);
+
+  return status;
+}
+
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/rhs3D.hpp b/benchmarks/advection_reaction_3D/rhs3D.hpp
deleted file mode 100644
index 874e5cb8bb..0000000000
--- a/benchmarks/advection_reaction_3D/rhs3D.hpp
+++ /dev/null
@@ -1,700 +0,0 @@
-/* -----------------------------------------------------------------------------
- * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
- * -----------------------------------------------------------------------------
- * SUNDIALS Copyright Start
- * Copyright (c) 2002-2023, Lawrence Livermore National Security
- * and Southern Methodist University.
- * All rights reserved.
- *
- * See the top-level LICENSE and NOTICE files for details.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- * SUNDIALS Copyright End
- * -----------------------------------------------------------------------------*/
-
-#ifndef ADVECTION_REACTION_3D_RHS_HPP
-#define ADVECTION_REACTION_3D_RHS_HPP
-
-#include "advection_reaction_3D.hpp"
-
-using raja_xyz_tuple = camp::tuple<RAJA::RangeSegment, RAJA::RangeSegment, RAJA::RangeSegment>;
-
-/* --------------------------------------------------------------
- * Right hand side (RHS) and residual functions
- * --------------------------------------------------------------*/
-
-/* Compute the advection term f(t,y) = -c (grad * y). This is done using
-   upwind 1st order finite differences. */
-static int Advection(realtype t, N_Vector y, N_Vector ydot, void* user_data)
-{
-  /* access problem data */
-  UserData* udata = (UserData*) user_data;
-
-  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-
-  /* set variable shortcuts */
-  const int      nxl = udata->grid->nxl;
-  const int      nyl = udata->grid->nyl;
-  const int      nzl = udata->grid->nzl;
-  const int      dof = udata->grid->dof;
-  const realtype c   = udata->c;
-  const realtype cx  = -c / udata->grid->dx;
-  const realtype cy  = -c / udata->grid->dy;
-  const realtype cz  = -c / udata->grid->dz;
-
-  /* local variables */
-  int retval;
-
-  /* begin exchanging boundary information */
-  if (udata->grid->nprocs() > 1)
-  {
-    retval = ExchangeAllStart(y, udata);
-    if (check_retval(&retval, "ExchangeAllStart", 1, udata->myid))
-      return(-1);
-  }
-
-  /* set output to zero */
-  N_VConst(0.0, ydot);
-
-  /* create views of the data */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > dYview(GetVecData(ydot),
-                                                      nxl, nyl, nzl, dof);
-
-  /* iterate over domain interior, computing advection */
-  if (c > 0.0)
-  {
-    /* flow moving in the positive x,y,z direction */
-    auto range = RAJA::make_tuple(RAJA::RangeSegment(1, nxl),
-                                  RAJA::RangeSegment(1, nyl),
-                                  RAJA::RangeSegment(1, nzl));
-
-    RAJA::kernel<XYZ_KERNEL_POL>(range,
-      [=] DEVICE_FUNC (int i, int j, int k) {
-      const realtype u_ijk = Yview(i,j,k,0);
-      const realtype v_ijk = Yview(i,j,k,1);
-      const realtype w_ijk = Yview(i,j,k,2);
-
-      // grad * u
-      dYview(i,j,k,0) =  cz * (u_ijk - Yview(i,j,k-1,0)); // du/dz
-      dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j-1,k,0)); // du/dy
-      dYview(i,j,k,0) += cx * (u_ijk - Yview(i-1,j,k,0)); // du/dx
-
-      // grad * v
-      dYview(i,j,k,1) =  cz * (v_ijk - Yview(i,j,k-1,1)); // dv/dz
-      dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j-1,k,1)); // dv/dy
-      dYview(i,j,k,1) += cx * (v_ijk - Yview(i-1,j,k,1)); // dv/dx
-
-      // grad * w
-      dYview(i,j,k,2) =  cz * (w_ijk - Yview(i,j,k-1,2)); // dw/dz
-      dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j-1,k,2)); // dw/dy
-      dYview(i,j,k,2) += cx * (w_ijk - Yview(i-1,j,k,2)); // dw/dx
-    });
-  }
-  else if (c < 0.0)
-  {
-    /* flow moving in the negative x,y,z direction */
-    auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1),
-                                  RAJA::RangeSegment(0, nyl-1),
-                                  RAJA::RangeSegment(0, nzl-1));
-    RAJA::kernel<XYZ_KERNEL_POL>(range,
-      [=] DEVICE_FUNC (int i, int j, int k) {
-      const realtype u_ijk = Yview(i,j,k,0);
-      const realtype v_ijk = Yview(i,j,k,1);
-      const realtype w_ijk = Yview(i,j,k,2);
-
-      // grad * u
-      dYview(i,j,k,0) =  cz * (u_ijk - Yview(i,j,k+1,0)); // du/dz
-      dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j+1,k,0)); // du/dy
-      dYview(i,j,k,0) += cx * (u_ijk - Yview(i+1,j,k,0)); // du/dx
-
-      // grad * v
-      dYview(i,j,k,1) =  cz * (v_ijk - Yview(i,j,k+1,1)); // dv/dz
-      dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j+1,k,1)); // dv/dy
-      dYview(i,j,k,1) += cx * (v_ijk - Yview(i+1,j,k,1)); // dv/dx
-
-      // grad * w
-      dYview(i,j,k,2) =  cz * (w_ijk - Yview(i,j,k+1,2)); // dw/dz
-      dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j+1,k,2)); // dw/dy
-      dYview(i,j,k,2) += cx * (w_ijk - Yview(i+1,j,k,2)); // dw/dx
-    });
-  }
-
-  /* finish exchanging boundary information */
-  if (udata->grid->nprocs() > 1)
-  {
-    retval = ExchangeAllEnd(udata);
-    if (check_retval(&retval, "ExchangeAllEnd", 1, udata->myid))
-      return(-1);
-  }
-
-  /* compute advection at process boundaries */
-  if (c > 0.0)
-  {
-    if (udata->grid->npx > 1)
-    {
-      /* Flow moving in the positive x,y,z direction:
-      *  boundaries are west face, south face, front face */
-
-      RAJA::View<realtype, RAJA::Layout<NDIMS> >
-        Yim1jk(udata->grid->getRecvBuffer("WEST"), nyl, nzl, dof); // Wrecv should have data that was sent from East
-
-      auto west_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
-                                        RAJA::RangeSegment(0, nzl),
-                                        RAJA::RangeSegment(0, dof));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(west_face,
-        [=] DEVICE_FUNC (int j, int k, int l) {
-        dYview(0,j,k,l) += cx * (Yview(0,j,k,l) - Yim1jk(j,k,l)); // d/dx
-      });
-    }
-    else
-    {
-      auto range = RAJA::make_tuple(RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(range,
-        [=] DEVICE_FUNC (int i, int j, int k) {
-        const realtype u_ijk = Yview(i,j,k,0);
-        const realtype v_ijk = Yview(i,j,k,1);
-        const realtype w_ijk = Yview(i,j,k,2);
-
-        dYview(i,j,k,0) = cx * (u_ijk - Yview(nxl-1,j,k,0)); // du/dx
-        dYview(i,j,k,1) = cx * (v_ijk - Yview(nxl-1,j,k,1)); // dv/dx
-        dYview(i,j,k,2) = cx * (w_ijk - Yview(nxl-1,j,k,2)); // dw/dx
-      });
-
-    }
-
-    if (udata->grid->npy > 1)
-    {
-      RAJA::View<realtype, RAJA::Layout<NDIMS> >
-        Yijm1k(udata->grid->getRecvBuffer("SOUTH"), nxl, nzl, dof); // Nrecv should have data that was sent from North
-
-      auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
-                                         RAJA::RangeSegment(0, nzl),
-                                         RAJA::RangeSegment(0, dof));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(south_face,
-        [=] DEVICE_FUNC (int i, int k, int l) {
-        dYview(i,0,k,l) += cy * (Yview(i,0,k,l) - Yijm1k(i,k,l)); // d/dy
-      });
-    }
-    else
-    {
-      auto range = RAJA::make_tuple(RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(range,
-        [=] DEVICE_FUNC (int i, int j, int k) {
-        const realtype u_ijk = Yview(i,j,k,0);
-        const realtype v_ijk = Yview(i,j,k,1);
-        const realtype w_ijk = Yview(i,j,k,2);
-
-        dYview(i,j,k,0) += cy * (u_ijk - Yview(i,nyl-1,k,0)); // du/dy
-        dYview(i,j,k,1) += cy * (v_ijk - Yview(i,nyl-1,k,1)); // dv/dy
-        dYview(i,j,k,2) += cy * (w_ijk - Yview(i,nyl-1,k,2)); // dw/dy
-      });
-    }
-
-    if (udata->grid->npz > 1)
-    {
-      RAJA::View<realtype, RAJA::Layout<NDIMS> >
-        Yijkm1(udata->grid->getRecvBuffer("FRONT"), nxl, nyl, dof); // Frecv should have data that was sent from Back
-
-      auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
-                                         RAJA::RangeSegment(0, nyl),
-                                         RAJA::RangeSegment(0, dof));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(front_face,
-        [=] DEVICE_FUNC (int i, int j, int l) {
-        dYview(i,j,0,l) += cz * (Yview(i,j,0,l) - Yijkm1(i,j,l)); // d/dz
-      });
-
-    }
-    else
-    {
-      auto range = RAJA::make_tuple(RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(range,
-        [=] DEVICE_FUNC (int i, int j, int k) {
-        const realtype u_ijk = Yview(i,j,k,0);
-        const realtype v_ijk = Yview(i,j,k,1);
-        const realtype w_ijk = Yview(i,j,k,2);
-
-        dYview(i,j,k,0) +=  cz * (u_ijk - Yview(i,j,nzl-1,0)); // du/dz
-        dYview(i,j,k,1) +=  cz * (v_ijk - Yview(i,j,nzl-1,1)); // dv/dz
-        dYview(i,j,k,2) +=  cz * (w_ijk - Yview(i,j,nzl-1,2)); // dw/dz
-      });
-    }
-  }
-  else if (c < 0.0)
-  {
-    if (udata->grid->nprocs() != 1)
-    {
-      /* Flow moving in the negative x,y,z direction:
-      *  boundaries are west face, south face, and front face */
-
-      RAJA::View<realtype, RAJA::Layout<3> >
-        Yip1jk(udata->grid->getRecvBuffer("EAST"), nyl, nzl, dof);
-      RAJA::View<realtype, RAJA::Layout<3> >
-        Yijp1k(udata->grid->getRecvBuffer("NORTH"), nxl, nzl, dof);
-      RAJA::View<realtype, RAJA::Layout<3> >
-        Yijkp1(udata->grid->getRecvBuffer("BACK"), nxl, nyl, dof);
-
-      auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1),
-                                         RAJA::RangeSegment(0, nyl-1),
-                                         RAJA::RangeSegment(0, dof));
-      RAJA::kernel<XYZ_KERNEL_POL>(front_face,
-        [=] DEVICE_FUNC (int i, int j, int l) {
-        dYview(i,j,0,l) =  cz * (Yview(i,j,0,l) - Yijkp1(i,nzl+1,l)); // d/dz
-        dYview(i,j,0,l) += cy * (Yview(i,j,0,l) - Yijp1k(0,j+1,l));   // d/dy
-        dYview(i,j,0,l) += cx * (Yview(i,j,0,l) - Yip1jk(i+1,0,l));   // d/dx
-      });
-
-      auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1),
-                                         RAJA::RangeSegment(0, nzl-1),
-                                         RAJA::RangeSegment(0, dof));
-      RAJA::kernel<XYZ_KERNEL_POL>(south_face,
-        [=] DEVICE_FUNC (int i, int k, int l) {
-        dYview(i,0,k,l) =  cz * (Yview(i,0,k,l) - Yijkp1(i,k+1,l));   // d/dz
-        dYview(i,0,k,l) += cy * (Yview(i,0,k,l) - Yijp1k(0,nyl+1,l)); // d/dy
-        dYview(i,0,k,l) += cx * (Yview(i,0,k,l) - Yip1jk(i+1,0,l));   // d/dx
-      });
-
-      auto east_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl-1),
-                                        RAJA::RangeSegment(0, nzl-1),
-                                        RAJA::RangeSegment(0, dof));
-      RAJA::kernel<XYZ_KERNEL_POL>(east_face,
-        [=] DEVICE_FUNC (int j, int k, int l) {
-        dYview(0,j,k,l) =  cz * (Yview(0,j,k,l) - Yijkp1(0,k+1,l));   // d/dz
-        dYview(0,j,k,l) += cy * (Yview(0,j,k,l) - Yijp1k(0,j+1,l));   // d/dy
-        dYview(0,j,k,l) += cx * (Yview(0,j,k,l) - Yip1jk(nxl+1,0,l)); // d/dx
-      });
-    }
-    else
-    {
-      auto range = RAJA::make_tuple(RAJA::RangeSegment(nxl-2, nxl),
-                                    RAJA::RangeSegment(nyl-2, nyl),
-                                    RAJA::RangeSegment(nzl-2, nzl));
-      RAJA::kernel<XYZ_KERNEL_POL>(range,
-        [=] DEVICE_FUNC (int i, int j, int k) {
-        const realtype u_ijk = Yview(i,j,k,0);
-        const realtype v_ijk = Yview(i,j,k,1);
-        const realtype w_ijk = Yview(i,j,k,2);
-
-        // grad * u
-        dYview(i,j,k,0) =  cz * (u_ijk - Yview(i,j,0,0)); // du/dz
-        dYview(i,j,k,0) += cy * (u_ijk - Yview(i,0,k,0)); // du/dy
-        dYview(i,j,k,0) += cx * (u_ijk - Yview(0,j,k,0)); // du/dx
-
-        // grad * v
-        dYview(i,j,k,1) =  cz * (v_ijk - Yview(i,j,0,1)); // dv/dz
-        dYview(i,j,k,1) += cy * (v_ijk - Yview(i,0,k,1)); // dv/dy
-        dYview(i,j,k,1) += cx * (v_ijk - Yview(0,j,k,1)); // dv/dx
-
-        // grad * w
-        dYview(i,j,k,2) =  cz * (w_ijk - Yview(i,j,0,2)); // dw/dz
-        dYview(i,j,k,2) += cy * (w_ijk - Yview(i,0,k,2)); // dw/dy
-        dYview(i,j,k,2) += cx * (w_ijk - Yview(0,j,k,2)); // dw/dx
-      });
-    }
-  }
-
-  /* return success */
-  return(0);
-}
-
-
-/* Compute the reaction term g(t,y). */
-static int Reaction(realtype t, N_Vector y, N_Vector ydot, void* user_data)
-{
-  /* access problem data */
-  UserData* udata = (UserData*) user_data;
-
-  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-
-  /* set variable shortcuts */
-  const realtype A  = udata->A;
-  const realtype B  = udata->B;
-  const realtype k1 = udata->k1;
-  const realtype k2 = udata->k2;
-  const realtype k3 = udata->k3;
-  const realtype k4 = udata->k4;
-  const realtype k5 = udata->k5;
-  const realtype k6 = udata->k6;
-
-  /* local variables */
-  realtype* Ydata  = NULL;
-  realtype* dYdata = NULL;
-
-  /* access data arrays */
-  Ydata = GetVecData(y);
-  if (check_retval((void *)Ydata, "GetVecData", 0, udata->myid))
-    return(-1);
-
-  dYdata = GetVecData(ydot);
-  if (check_retval((void *)dYdata, "GetVecData", 0, udata->myid))
-    return(-1);
-
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     udata->grid->nxl,
-                                                     udata->grid->nyl,
-                                                     udata->grid->nzl,
-                                                     udata->grid->dof);
-
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > dYview(GetVecData(ydot),
-                                                      udata->grid->nxl,
-                                                      udata->grid->nyl,
-                                                      udata->grid->nzl,
-                                                      udata->grid->dof);
-
-  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                RAJA::RangeSegment(0, udata->grid->nyl),
-                                RAJA::RangeSegment(0, udata->grid->nzl));
-
-  /* iterate over domain, computing reactions */
-  if (udata->add_reactions)
-  {
-    /* when we are not additively splitting the rhs, we add to ydot
-       as we expect it to hold the advection term already */
-    RAJA::kernel<XYZ_KERNEL_POL>(range,
-      [=] DEVICE_FUNC (int i, int j, int k) {
-      const realtype u = Yview(i,j,k,0);
-      const realtype v = Yview(i,j,k,1);
-      const realtype w = Yview(i,j,k,2);
-      dYview(i,j,k,0) += k1 * A - k2 * w * u + k3 * u * u * v - k4 * u;
-      dYview(i,j,k,1) += k2 * w * u - k3 * u * u * v;
-      dYview(i,j,k,2) += -k2 * w * u + k5 * B - k6 * w;
-    });
-  }
-  else
-  {
-    /* set output to zero */
-    N_VConst(0.0, ydot);
-
-    RAJA::kernel<XYZ_KERNEL_POL>(range,
-      [=] DEVICE_FUNC (int i, int j, int k) {
-      const realtype u = Yview(i,j,k,0);
-      const realtype v = Yview(i,j,k,1);
-      const realtype w = Yview(i,j,k,2);
-      dYview(i,j,k,0) = k1 * A - k2 * w * u + k3 * u * u * v - k4 * u;
-      dYview(i,j,k,1) = k2 * w * u - k3 * u * u * v;
-      dYview(i,j,k,2) = -k2 * w * u + k5 * B - k6 * w;
-    });
-  }
-
-  /* return success */
-    return(0);
-}
-
-
-/* Compute the RHS as h(t,y) = f(t,y) + g(t,y). */
-static int AdvectionReaction(realtype t, N_Vector y, N_Vector ydot,
-                             void *user_data)
-{
-  /* access problem data */
-  UserData* udata = (UserData*) user_data;
-  int retval;
-
-  /* NOTE: The order in which Advection and Reaction are
-           called is critical here. Advection must be
-           computed first. */
-  retval = Advection(t, y, ydot, user_data);
-  if (check_retval((void *)&retval, "Advection", 1, udata->myid)) return(-1);
-
-  retval = Reaction(t, y, ydot, user_data);
-  if (check_retval((void *)&retval, "Reaction", 1, udata->myid)) return(-1);
-
-  /* return success */
-  return(0);
-}
-
-/* Compute the residual F(t,y,y') = ydot - h(t,y) = 0. */
-static int AdvectionReactionResidual(realtype t, N_Vector y, N_Vector ydot,
-                                     N_Vector F, void *user_data)
-{
-  /* access problem data */
-  UserData* udata = (UserData*) user_data;
-  int retval;
-
-  /* NOTE: The order in which Advection and Reaction are
-           called is critical here. Advection must be
-           computed first. */
-  retval = Advection(t, y, F, user_data); /* F = -c y_x */
-  if (check_retval((void *)&retval, "Advection", 1, udata->myid)) return(-1);
-
-  retval = Reaction(t, y, F, user_data);  /* F = -c y_x + g(t,y) */
-  if (check_retval((void *)&retval, "Reaction", 1, udata->myid)) return(-1);
-
-  /* F = ydot - h(t,y) = ydot + c y_x - g(t,y) */
-  N_VLinearSum(1.0, ydot, -1.0, F, F);
-
-  /* return success */
-  return(0);
-}
-
-/* --------------------------------------------------------------
- * Linear system and Jacobian functions
- * --------------------------------------------------------------*/
-
-/* Solve the linear systems Ax = b where A = I - gamma*dg/dy.
-   When using a fully implicit method, we are approximating
-   dh/dy as dg/dy. */
-static int SolveReactionLinSys(N_Vector y, N_Vector x, N_Vector b,
-                               realtype gamma, raja_xyz_tuple blocks,
-                               UserData* udata)
-{
-  /* shortcuts */
-  int       dof, nxl, nyl, nzl;
-  realtype  k2, k3, k4, k6;
-
-  /* set shortcuts */
-  dof = udata->grid->dof;
-  nxl = udata->grid->nxl;
-  nyl = udata->grid->nyl;
-  nzl = udata->grid->nzl;
-  k2  = udata->k2;
-  k3  = udata->k3;
-  k4  = udata->k4;
-  k6  = udata->k6;
-  
-  /* create views of the data */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Bview(GetVecData(b),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Xview(GetVecData(x),
-                                                     nxl, nyl, nzl, dof);
-
-  RAJA::kernel<XYZ_KERNEL_POL>(blocks,
-    [=] DEVICE_FUNC (int i, int j, int k) {
-
-    /* and the corresponding vectors */
-    realtype *b = &(Bview(i,j,k,0));
-    realtype *x = &(Xview(i,j,k,0));
-
-    /* shortcuts to u, v, w for the block */
-    realtype u = Yview(i,j,k,0);
-    realtype v = Yview(i,j,k,1);
-    realtype w = Yview(i,j,k,2);
-
-    realtype A0, A1, A2, A3, A4, A5, A6, A7, A8;
-
-    //
-    // compute J = dg/dy
-    //
-
-    /* 1st row: u, v, w */
-    A0 = -k2 * w + 2.0 * k3 * u * v - k4;
-    A1 =  k3 * u * u;
-    A2 = -k2 * u;
-
-    /* 2nd row: u, v, w */
-    A3 =  k2 * w - 2.0 * k3 * u * v;
-    A4 = -k3 * u * u;
-    A5 =  k2 * u;
-
-    /* 3rd row: u, v, w */
-    A6 = -k2 * w;
-    A7 =  0.0;
-    A8 = -k2 * u - k6;
-
-    //
-    // compute A = I - gamma*J
-    //
-
-    A0 = 1. - (gamma * A0);
-    A1 = -gamma * A1;
-    A2 = -gamma * A2;
-    A3 = -gamma * A3;
-    A4 = 1. - (gamma * A4);
-    A5 = -gamma * A5;
-    A6 = -gamma * A6;
-    A7 = -gamma * A7;
-    A8 = 1. - (gamma * A8);
-
-    //
-    // compute x = A^{-1}b
-    //
-
-    realtype scratch_0 = A4*A8;
-    realtype scratch_1 = A1*A5;
-    realtype scratch_2 = A2*A7;
-    realtype scratch_3 = A5*A7;
-    realtype scratch_4 = A1*A8;
-    realtype scratch_5 = A2*A4;
-    realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
-    realtype scratch_7 = A2*A3;
-    realtype scratch_8 = A6*b[0];
-    realtype scratch_9 = A2*A6;
-    realtype scratch_10 = A3*b[0];
-    realtype scratch_11 = 1.0/A0;
-    realtype scratch_12 = A1*scratch_11;
-    realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
-
-    x[0] = scratch_6*(b[0]*scratch_0 - b[0]*scratch_3 + b[1]*scratch_2 - b[1]*scratch_4 + b[2]*scratch_1 - b[2]*scratch_5);
-    x[1] = scratch_6*(-A0*A5*b[2] + A0*A8*b[1] + A5*scratch_8 - A8*scratch_10 - b[1]*scratch_9 + b[2]*scratch_7);
-    x[2] = (-b[2] + scratch_11*scratch_8 + scratch_13*(b[1] - scratch_10*scratch_11))/(-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
-  });
-
-  return(0);
-}
-
-/* Solve the linear systems Ax = b where A = -dg/dy + gamma.
-   We are approximating dh/dy as dg/dy. */
-static int SolveReactionLinSysRes(N_Vector y, N_Vector x, N_Vector b,
-                                  realtype gamma, raja_xyz_tuple blocks,
-                                  UserData* udata)
-{
-  /* shortcuts */
-  int       dof, nxl, nyl, nzl;
-  realtype  k2, k3, k4, k6;
-
-  /* set shortcuts */
-  dof = udata->grid->dof;
-  nxl = udata->grid->nxl;
-  nyl = udata->grid->nyl;
-  nzl = udata->grid->nzl;
-  k2    = udata->k2;
-  k3    = udata->k3;
-  k4    = udata->k4;
-  k6    = udata->k6;
-
-  /* create views of the data */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Bview(GetVecData(b),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Xview(GetVecData(x),
-                                                     nxl, nyl, nzl, dof);
-
-  RAJA::kernel<XYZ_KERNEL_POL>(blocks,
-    [=] DEVICE_FUNC (int i, int j, int k) {
-
-    /* and the corresponding vectors */
-    realtype *b = &(Bview(i,j,k,0));
-    realtype *x = &(Xview(i,j,k,0));
-
-    /* shortcuts to u, v, w for the block */
-    realtype u = Yview(i,j,k,0);
-    realtype v = Yview(i,j,k,1);
-    realtype w = Yview(i,j,k,2);
-
-    realtype A0, A1, A2, A3, A4, A5, A6, A7, A8;
-
-    //
-    // compute dg/dy
-    //
-
-    /* 1st row: u, v, w */
-    A0 = -k2 * w + 2.0 * k3 * u * v - k4;
-    A1 =  k3 * u * u;
-    A2 = -k2 * u;
-
-    /* 2nd row: u, v, w */
-    A3 =  k2 * w - 2.0 * k3 * u * v;
-    A4 = -k3 * u * u;
-    A5 =  k2 * u;
-
-    /* 3rd row: u, v, w */
-    A6 = -k2 * w;
-    A7 =  0.0;
-    A8 = -k2 * u - k6;
-
-    //
-    // compute A = -dg/dy + gamma*diag(df/dydot)
-    // where diag(df/dydot) is approximated as
-    // diag([udot, vdot, wdot])
-    //
-
-    A0 = -A0 + gamma;
-    A1 = -A1;
-    A2 = -A2;
-    A3 = -A3;
-    A4 = -A4 + gamma;
-    A5 = -A5;
-    A6 = -A6;
-    A7 = -A7;
-    A8 = -A8 + gamma;
-
-    //
-    // compute x = A^{-1}b
-    //
-
-    realtype scratch_0 = A4*A8;
-    realtype scratch_1 = A1*A5;
-    realtype scratch_2 = A2*A7;
-    realtype scratch_3 = A5*A7;
-    realtype scratch_4 = A1*A8;
-    realtype scratch_5 = A2*A4;
-    realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
-    realtype scratch_7 = A2*A3;
-    realtype scratch_8 = A6*b[0];
-    realtype scratch_9 = A2*A6;
-    realtype scratch_10 = A3*b[0];
-    realtype scratch_11 = 1.0/A0;
-    realtype scratch_12 = A1*scratch_11;
-    realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
-
-    x[0] = scratch_6*(b[0]*scratch_0 - b[0]*scratch_3 + b[1]*scratch_2 - b[1]*scratch_4 + b[2]*scratch_1 - b[2]*scratch_5);
-    x[1] = scratch_6*(-A0*A5*b[2] + A0*A8*b[1] + A5*scratch_8 - A8*scratch_10 - b[1]*scratch_9 + b[2]*scratch_7);
-    x[2] = (-b[2] + scratch_11*scratch_8 + scratch_13*(b[1] - scratch_10*scratch_11))/(-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
-  });
-
-  return(0);
-}
-
-
-/* --------------------------------------------------------------
- * Preconditioner functions
- * --------------------------------------------------------------*/
-
-/* Solves Pz = r where P = I - gamma * dg/dy */
-static int PSolve(realtype t, N_Vector y, N_Vector ydot, N_Vector r,
-                  N_Vector z, realtype gamma, realtype delta, int lr,
-
-                  void *user_data)
-{
-  /* local variables */
-  UserData* udata = (UserData*) user_data;
-  int       retval;
-
-  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-
-  /* solve the task-local linear system Pz = r */
-  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                RAJA::RangeSegment(0, udata->grid->nyl),
-                                RAJA::RangeSegment(0, udata->grid->nzl));
-  retval = SolveReactionLinSys(y, z, r, gamma, range, udata);
-
-  return(retval);
-}
-
-/* Solves Pz = r where P = -dg/dy + gamma */
-static int PSolveRes(realtype t, N_Vector y, N_Vector ydot, N_Vector F,
-                     N_Vector r, N_Vector z, realtype cj, realtype delta,
-                     void *user_data)
-{
-  /* local variables */
-  UserData* udata = (UserData*) user_data;
-  int       retval;
-
-  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-
-  /* solve the task-local linear system Pz = r */
-  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                RAJA::RangeSegment(0, udata->grid->nyl),
-                                RAJA::RangeSegment(0, udata->grid->nzl));
-  retval = SolveReactionLinSysRes(y, z, r, cj, range, udata);
-
-  return(retval);
-}
-
-
-#endif
diff --git a/benchmarks/advection_reaction_3D/scripts/make_plots.py b/benchmarks/advection_reaction_3D/scripts/make_plots.py
new file mode 100755
index 0000000000..7728562510
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/scripts/make_plots.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python
+# ------------------------------------------------------------------------------
+# Programmer(s):  Daniel R. Reynolds @ SMU
+# ------------------------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2023, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ------------------------------------------------------------------------------
+# matplotlib-based plotting script for the advection_reaction_3D benchmark codes
+#
+# Run in the directory containing the benchmark output files. Reads
+# t.000000.txt, mesh.txt, and {u,v,w}.00000<rank>.txt (from a 1 or 8 MPI rank
+# run) and writes {xy,yz,xz}-slices.png and {xy,yz,xz}-projections.png.
+# ------------------------------------------------------------------------------
+
+# imports
+import sys
+from os.path import exists
+import numpy as np
+import matplotlib.pyplot as plt
+
+# ------------------------------------------------------------------------------
+
+# utility functions
+def parallel_coords(rank):
+    """Return the [x, y, z] process coordinates of rank in a 2x2x2 process grid.
+
+    The z coordinate varies fastest: rank = 4*x + 2*y + z.
+    """
+    if rank not in range(8):
+        raise ValueError("parallel_coords: rank must be in 0..7, got " + str(rank))
+    return [(rank >> 2) & 1, (rank >> 1) & 1, rank & 1]
+
+def xslice(u,it,ix):
+    """Return the 2D [y, z] slice of u at time index it and x index ix."""
+    return u[it,ix,:,:]
+
+def yslice(u,it,iy):
+    """Return the 2D [x, z] slice of u at time index it and y index iy."""
+    return u[it,:,iy,:]
+
+def zslice(u,it,iz):
+    """Return the 2D [x, y] slice of u at time index it and z index iz."""
+    return u[it,:,:,iz]
+
+def xproj(u,it):
+    """Return u at time index it averaged over x, as a 2D [y, z] array."""
+    return np.average(u[it,:,:,:], axis=0)
+
+def yproj(u,it):
+    """Return u at time index it averaged over y, as a 2D [x, z] array."""
+    return np.average(u[it,:,:,:], axis=1)
+
+def zproj(u,it):
+    """Return u at time index it averaged over z, as a 2D [x, y] array."""
+    return np.average(u[it,:,:,:], axis=2)
+
+def myplot(axis, X, Y, Z, xlabel='none', ylabel='none'):
+    """Draw filled contours of Z over the grids X, Y on axis and add a colorbar.
+
+    A label equal to the string 'none' (the default) is not set.
+    """
+    frame = axis.contourf(X, Y, Z)
+    plt.colorbar(frame, ax=axis)
+    if (xlabel != 'none'):
+        axis.set_xlabel(xlabel)
+    if (ylabel != 'none'):
+        axis.set_ylabel(ylabel)
+
+def count_ranks():
+    """Count the consecutively numbered u.<rank>.txt files, starting at rank 0."""
+    nprocs = 0
+    while exists("u.%06d.txt" % nprocs):
+        nprocs += 1
+    return nprocs
+
+def load_field(name, rank, nt, npts):
+    """Load <name>.<rank>.txt as an (nt, npts) array; exit if the shape differs."""
+    data = np.loadtxt("%s.%06d.txt" % (name, rank), ndmin=2)
+    if data.shape != (nt, npts):
+        sys.exit("make_plots.py error: mesh and data have incompatible sizes")
+    return data
+
+def load_solution(nprocs, nt, nx, ny, nz):
+    """Return global (nt, nx, ny, nz) arrays u, v, w assembled from all ranks.
+
+    nprocs must be 1 (one block) or 8 (a 2x2x2 grid of equally sized blocks).
+    """
+    pdim = 1 if nprocs == 1 else 2
+    nxl, nyl, nzl = nx//pdim, ny//pdim, nz//pdim
+    fields = [np.zeros((nt,nx,ny,nz), dtype=float) for _ in range(3)]
+    for ip in range(nprocs):
+        cx, cy, cz = parallel_coords(ip)
+        block = (slice(None),
+                 slice(cx*nxl, (cx+1)*nxl),
+                 slice(cy*nyl, (cy+1)*nyl),
+                 slice(cz*nzl, (cz+1)*nzl))
+        for name, field in zip("uvw", fields):
+            data = load_field(name, ip, nt, nxl*nyl*nzl)
+            # each row holds one time level in C (x slowest, z fastest) order
+            field[block] = np.reshape(data, (nt,nxl,nyl,nzl), order='C')
+    return fields
+
+def plot_panels(filename, X, Y, fields, times, tslice, extract, figsize):
+    """Save a grid of contour plots with one row per field, one column per time.
+
+    fields is a sequence of (label, array) pairs, tslice the time indices to
+    plot, and extract(array, it) returns the 2D data drawn for time index it.
+    """
+    nrows = len(fields)
+    fig, axes = plt.subplots(nrows, len(tslice), sharex=True, sharey=True,
+                             figsize=figsize, squeeze=False)
+    for row, (label, data) in enumerate(fields):
+        for col, it in enumerate(tslice):
+            # the bottom row is labeled with the time that is actually plotted
+            xlabel = 't = ' + str(times[it]) if row == nrows-1 else 'none'
+            ylabel = label if col == 0 else 'none'
+            myplot(axes[row,col], X, Y, extract(data, it),
+                   xlabel=xlabel, ylabel=ylabel)
+    fig.savefig(filename)
+    plt.close(fig)
+
+def main():
+    """Read the benchmark output in the current directory and write all plots."""
+    # read time mesh
+    times = np.atleast_1d(np.loadtxt("t.000000.txt"))
+    nt = times.size
+
+    # read spatial mesh
+    mesh = np.loadtxt("mesh.txt", dtype=float)
+    x = mesh[0,:]
+    y = mesh[1,:]
+    z = mesh[2,:]
+    nx = x.size
+    ny = y.size
+    nz = z.size
+
+    # ensure that the run used exactly 1 or 8 MPI ranks
+    nprocs = count_ranks()
+    if nprocs not in (1, 8):
+        sys.exit("make_plots.py error: run must have used either 1 or 8 MPI ranks")
+
+    # load data for run
+    u, v, w = load_solution(nprocs, nt, nx, ny, nz)
+    fields = (('u', u), ('v', v), ('w', w))
+
+    # set meshgrid objects; 'ij' indexing gives grids of shape (n0, n1), matching
+    # the [first axis, second axis] layout of the slices and projections
+    xy0,xy1 = np.meshgrid(x, y, indexing='ij')
+    yz0,yz1 = np.meshgrid(y, z, indexing='ij')
+    xz0,xz1 = np.meshgrid(x, z, indexing='ij')
+
+    # plot settings; indices are clamped so that small runs can still be plotted
+    sliceidx = min(25, nx-1, ny-1, nz-1)
+    tslice = [min(it, nt-1) for it in (0, 5, 10)]
+    figsize = (9,7)
+
+    # slices at various times
+    plot_panels('xy-slices.png', xy0, xy1, fields, times, tslice,
+                lambda f, it: zslice(f, it, sliceidx), figsize)
+    plot_panels('yz-slices.png', yz0, yz1, fields, times, tslice,
+                lambda f, it: xslice(f, it, sliceidx), figsize)
+    plot_panels('xz-slices.png', xz0, xz1, fields, times, tslice,
+                lambda f, it: yslice(f, it, sliceidx), figsize)
+
+    # projections at various times
+    plot_panels('xy-projections.png', xy0, xy1, fields, times, tslice, zproj, figsize)
+    plot_panels('yz-projections.png', yz0, yz1, fields, times, tslice, xproj, figsize)
+    plot_panels('xz-projections.png', xz0, xz1, fields, times, tslice, yproj, figsize)
+
+if __name__ == "__main__":
+    main()
+
+##### end of script #####
diff --git a/benchmarks/diffusion_2D/README.md b/benchmarks/diffusion_2D/README.md
index d60869a8d8..453879953f 100644
--- a/benchmarks/diffusion_2D/README.md
+++ b/benchmarks/diffusion_2D/README.md
@@ -8,24 +8,26 @@ required.
 
 This code simulates the anisotropic 2D heat equation,
 
-$$\frac{\partial u}{\partial t} = k_x \frac{\partial^2 u}{\partial x^2} + k_y \frac{\partial^2 u}{\partial y^2} + b,$$
+$$\frac{\partial u}{\partial t} = \nabla \cdot (D \nabla u) + b(t, \mathbf{x}),$$
 
-where $k_x$ and $k_y$ are the diffusion coefficients. The system is evolved for
-$t$ in $[0, t_f]$ and $(x,y) = X$ in $[0, X_{max}]^2$ with the initial condition
+where $D$ is a diagonal matrix with entries $k_x$ and $k_y$. The system is
+evolved for $t \in [0, t_f]$ on the rectangular domain
+$(x,y) \equiv \mathbf{x} \in [\mathbf{0}, \mathbf{x}_{\text{max}}]$, with the
+initial condition
 
-$$u(0,X) = \sin^2(\pi x) \sin^2(\pi y),$$
+$$u(0,\mathbf{x}) = \sin^2(\pi x) \sin^2(\pi y),$$
 
 and stationary boundary conditions
 
-$$\frac{\partial u}{\partial t}(t,0,y) = \frac{\partial u}{\partial t}(t,x_{max},y) = \frac{\partial u}{\partial t}(t,x,0) = \frac{\partial u}{\partial t}(t,x,y_{max}) = 0.$$
+$$\frac{\partial u}{\partial t}(t,0,y) = \frac{\partial u}{\partial t}(t,x_{\text{max}},y) = \frac{\partial u}{\partial t}(t,x,0) = \frac{\partial u}{\partial t}(t,x,y_{\text{max}}) = 0.$$
 
 The source term is given by
 
-$$b(t,X) = -2 \pi \sin^2(\pi x) \sin^2(\pi y) \sin(\pi t) \cos(\pi t) - k_x 2 \pi^2 (\cos^2(\pi x) - \sin^2(\pi x)) \sin^2(\pi y) \cos^2(\pi t) - k_y 2 \pi^2 (\cos^2(\pi y) - \sin^2(\pi y)) \sin^2(\pi x) \cos^2(\pi t).$$
+$$b(t,\mathbf{x}) = -2 \pi \sin^2(\pi x) \sin^2(\pi y) \sin(\pi t) \cos(\pi t) - k_x 2 \pi^2 (\cos^2(\pi x) - \sin^2(\pi x)) \sin^2(\pi y) \cos^2(\pi t) - k_y 2 \pi^2 (\cos^2(\pi y) - \sin^2(\pi y)) \sin^2(\pi x) \cos^2(\pi t).$$
 
 Under this setup, the problem has the analytical solution
 
-$$u(t,X) = \sin^2(\pi x) \sin^2(\pi y) \cos^2(\pi t).$$
+$$u(t,\mathbf{x}) = \sin^2(\pi x) \sin^2(\pi y) \cos^2(\pi t).$$
 
 Spatial derivatives are computed using second-order centered differences on a
 uniform spatial grid. The problem can be evolved in time with ARKODE, CVODE, or
@@ -33,11 +35,12 @@ IDA. With ARKODE, an adaptive step diagonally implicit Runge-Kutta (DIRK) method
 is applied. When using CVODE or IDA, adaptive order and step BDF methods are
 used.
 
-In all cases, the nonlinear system(s) in each time step are solved using an
-inexact Newton method paired with a matrix-free PCG or GMRES linear solver and a
-Jacobi preconditioner. If SUNDIALS is built with the SuperLU_DIST interface enabled
-a modified Newton method with SuperLU_DIST as the direct linear solver may also be
-selected at run time.
+By default, the nonlinear system(s) in each time step are solved using an
+inexact Newton method paired with a matrix-free CG linear solver and a Jacobi
+preconditioner. A matrix-free GMRES linear solver may be selected at run time.
+If SUNDIALS is built with the SuperLU_DIST interface enabled, a modified Newton
+method with SuperLU_DIST as the direct linear solver may also be selected at run
+time.
 
 ## Options
 
@@ -53,10 +56,10 @@ listed below.
 | `--npy <int>`                        | Number of MPI tasks in the y-direction (0 forces MPI to decide)                          | 0       |
 | `--nx <int>`                         | Number of mesh points in the x-direction                                                 | 32      |
 | `--ny <int>`                         | Number of mesh points in the y-direction                                                 | 32      |
-| `--ux <realtype>`                    | The domain upper bound in the x-direction `x_max`                                        | 1.0     |
-| `--uy <realtype>`                    | The domain upper bound in the y-direction `y_max`                                        | 1.0     |
-| `--kx <realtype>`                    | Diffusion coefficient in the x-direction `kx`                                            | 1.0     |
-| `--ky <realtype>`                    | Diffusion coefficient in the y-direction `ky`                                            | 1.0     |
+| `--xu <realtype>`                    | The domain upper bound in the x-direction $x_{\text{max}}$                               | 1.0     |
+| `--yu <realtype>`                    | The domain upper bound in the y-direction $y_{\text{max}}$                               | 1.0     |
+| `--kx <realtype>`                    | Diffusion coefficient in the x-direction $k_x$                                           | 1.0     |
+| `--ky <realtype>`                    | Diffusion coefficient in the y-direction $k_y$                                           | 1.0     |
 | `--tf <realtype>`                    | The final time `tf`                                                                      | 1.0     |
 | `--noforcing`                        | Disable the forcing term                                                                 | Enabled |
 | Output Options                       |                                                                                          |         |
@@ -94,9 +97,11 @@ Based on the configuration, executables for each integrator and backend option
 are built and installed in `<BENCHMARKS_INSTALL_PATH>/diffusion_2D`. The
 executables follow the naming convention `<package>_diffusion_2D_<parallelism>`
 where `<package>` is `arkode`, `cvode`, or `ida` and `<parallelism>` is `mpi` for
-MPI only parallelism, `mpicuda` for MPI + CUDA, and `mpihip` for MPI + HIP. Note
-when using the SuperLU_DIST linear solver computations will be offloaded to the
-GPU in the MPI only executables if CUDA or ROCM support is enabled in SuperLU_DIST.
+MPI only parallelism, `mpicuda` for MPI + CUDA, and `mpihip` for MPI + HIP.
+
+**Note:** When using the SuperLU_DIST linear solver, computations will be
+offloaded to the GPU in the MPI-only executables if CUDA or ROCm support is
+enabled in SuperLU_DIST.
 
 On Summit, with the default environment
 ```
diff --git a/cmake/SundialsBuildOptionsPre.cmake b/cmake/SundialsBuildOptionsPre.cmake
index b1532d2ce3..a28b3485ea 100644
--- a/cmake/SundialsBuildOptionsPre.cmake
+++ b/cmake/SundialsBuildOptionsPre.cmake
@@ -294,3 +294,5 @@ sundials_option(SUNDIALS_TEST_DEVTESTS BOOL
 # Include unit tests in regression tests
 sundials_option(SUNDIALS_TEST_UNITTESTS BOOL
   "Include unit tests in make test" OFF ADVANCED)
+
+sundials_option(SUNDIALS_TEST_MPIRUN_COMMAND STRING "Job scheduler or mpirun command used to launch SUNDIALS MPI tests." "" ADVANCED)
diff --git a/cmake/SundialsTPLOptions.cmake b/cmake/SundialsTPLOptions.cmake
index f01a0ac14d..11e39d0f99 100644
--- a/cmake/SundialsTPLOptions.cmake
+++ b/cmake/SundialsTPLOptions.cmake
@@ -61,6 +61,11 @@ sundials_option(ENABLE_HIP BOOL "Enable HIP support" OFF)
 # -------------------------------------------------------------
 sundials_option(ENABLE_SYCL BOOL "Enable SYCL support" OFF)
 
+sundials_option(SUNDIALS_SYCL_2020_UNSUPPORTED BOOL
+                "Disable the use of some SYCL 2020 features in SUNDIALS libraries and examples" OFF
+                DEPENDS_ON ENABLE_SYCL
+                ADVANCED)
+
 # ---------------------------------------------------------------
 # Enable LAPACK support?
 # ---------------------------------------------------------------
@@ -288,6 +293,16 @@ sundials_option(ONEMKL_WORKS BOOL "Set to ON to force CMake to accept a given on
                 DEPENDS_ON ENABLE_ONEMKL
                 ADVANCED)
 
+sundials_option(SUNDIALS_ONEMKL_USE_GETRF_LOOP BOOL
+                "Replace batched getrf call with loop over getrf" OFF
+                DEPENDS_ON ENABLE_ONEMKL
+                ADVANCED)
+
+sundials_option(SUNDIALS_ONEMKL_USE_GETRS_LOOP BOOL
+                "Replace batched getrs call with loop over getrs" OFF
+                DEPENDS_ON ENABLE_ONEMKL
+                ADVANCED)
+
 # ---------------------------------------------------------------
 # Enable Caliper support?
 # ---------------------------------------------------------------
diff --git a/cmake/macros/SundialsAddTest.cmake b/cmake/macros/SundialsAddTest.cmake
index 45bf3d8ebd..b93027a017 100644
--- a/cmake/macros/SundialsAddTest.cmake
+++ b/cmake/macros/SundialsAddTest.cmake
@@ -135,8 +135,13 @@ macro(SUNDIALS_ADD_TEST NAME EXECUTABLE)
       endif()
 
       # check if this test is run with MPI and set the MPI run command
-      if((SUNDIALS_ADD_TEST_MPI_NPROCS) AND (MPIEXEC_EXECUTABLE))
-        set(RUN_COMMAND "${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${MPIEXEC_PREFLAGS}")
+      if((SUNDIALS_ADD_TEST_MPI_NPROCS) AND ((MPIEXEC_EXECUTABLE) OR (SUNDIALS_TEST_MPIRUN_COMMAND)))
+        if (SUNDIALS_TEST_MPIRUN_COMMAND)
+          set(RUN_COMMAND "${SUNDIALS_TEST_MPIRUN_COMMAND} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${MPIEXEC_PREFLAGS}")
+        elseif(MPIEXEC_EXECUTABLE)
+          set(RUN_COMMAND "${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${MPIEXEC_PREFLAGS}")
+        endif()
+
         # remove trailing white space (empty MPIEXEC_PREFLAGS) as it can cause
         # erroneous test failures with some MPI implementations
         string(STRIP "${RUN_COMMAND}" RUN_COMMAND)
@@ -174,11 +179,16 @@ macro(SUNDIALS_ADD_TEST NAME EXECUTABLE)
       endif()
 
       # check if this test is run with MPI and add the test run command
-      if((SUNDIALS_ADD_TEST_MPI_NPROCS) AND (MPIEXEC_EXECUTABLE))
+      if((SUNDIALS_ADD_TEST_MPI_NPROCS) AND ((MPIEXEC_EXECUTABLE) OR (SUNDIALS_TEST_MPIRUN_COMMAND)))
         if(MPIEXEC_PREFLAGS)
           string(REPLACE " " ";" PREFLAGS "${MPIEXEC_PREFLAGS}")
         endif()
-        add_test(NAME ${NAME} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${PREFLAGS} $<TARGET_FILE:${EXECUTABLE}> ${TEST_ARGS})
+        if (SUNDIALS_TEST_MPIRUN_COMMAND)
+          string(REPLACE " " ";" MPI_EXEC_ARGS "${SUNDIALS_TEST_MPIRUN_COMMAND}")
+          add_test(NAME ${NAME} COMMAND ${MPI_EXEC_ARGS} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${PREFLAGS} $<TARGET_FILE:${EXECUTABLE}> ${TEST_ARGS})
+        else()
+          add_test(NAME ${NAME} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${PREFLAGS} $<TARGET_FILE:${EXECUTABLE}> ${TEST_ARGS})
+        endif()
       else()
         add_test(NAME ${NAME} COMMAND $<TARGET_FILE:${EXECUTABLE}> ${TEST_ARGS})
       endif()
diff --git a/cmake/tpl/SundialsONEMKL.cmake b/cmake/tpl/SundialsONEMKL.cmake
index 34177ff0fe..a807a2e7f4 100644
--- a/cmake/tpl/SundialsONEMKL.cmake
+++ b/cmake/tpl/SundialsONEMKL.cmake
@@ -64,6 +64,7 @@ find_package(MKL CONFIG
              NO_DEFAULT_PATH
              REQUIRED)
 
+message(STATUS "MKL Version: ${MKL_VERSION}")
 message(STATUS "MKL Targets: ${MKL_IMPORTED_TARGETS}")
 
 # -----------------------------------------------------------------------------
diff --git a/doc/arkode/guide/source/Butcher.rst b/doc/arkode/guide/source/Butcher.rst
index 5bfdd6169c..6666a4f7dd 100644
--- a/doc/arkode/guide/source/Butcher.rst
+++ b/doc/arkode/guide/source/Butcher.rst
@@ -180,6 +180,41 @@ This is the default 2nd order explicit method.
    region is outlined in blue; the embedding's region is in red.
 
 
+.. _Butcher.ARK2_ERK:
+
+ARK2-ERK-3-1-2
+^^^^^^^^^^^^^^
+
+.. index:: ARK2-ERK-3-1-2
+
+Accessible via the constant ``ARKODE_ARK2_ERK_3_1_2`` to
+:c:func:`ARKStepSetTableNum()`, :c:func:`ERKStepSetTableNum()` or
+:c:func:`ARKodeButcherTable_LoadERK()`.
+Accessible via the string ``"ARKODE_ARK2_ERK_3_1_2"`` to
+:c:func:`ARKStepSetTableName()`, :c:func:`ERKStepSetTableName()` or
+:c:func:`ARKodeButcherTable_LoadERKByName()`.
+This is the explicit portion of the default 2nd order additive method (the
+explicit portion of the ARK2 method from :cite:p:`giraldo2013implicit`).
+
+.. math::
+
+   \renewcommand{\arraystretch}{1.5}
+   \begin{array}{r|ccc}
+     0            & 0                           & 0                       & 0 \\
+     2 - \sqrt{2} & 2 - \sqrt{2}                & 0                       & 0 \\
+     1            & 1 - \frac{3 + 2\sqrt{2}}{6} & \frac{3 + 2\sqrt{2}}{6} & 0 \\
+     \hline
+     2 & \frac{1}{2\sqrt{2}}    & \frac{1}{2\sqrt{2}}    & 1 - \frac{1}{\sqrt{2}} \\
+     1 & \frac{4 - \sqrt{2}}{8} & \frac{4 - \sqrt{2}}{8} & \frac{1}{2\sqrt{2}}    \\
+   \end{array}
+
+.. figure:: /figs/arkode/ark2_erk_stab_region.png
+   :scale: 65 %
+   :align: center
+
+   Linear stability region for the ARK2-ERK method. The method's
+   region is outlined in blue; the embedding's region is in red.
+
 
 .. _Butcher.Bogacki_Shampine:
 
@@ -816,6 +851,41 @@ are A- and B-stable.
    region is outlined in blue; the embedding's region is in red.
 
 
+.. _Butcher.ARK2_DIRK:
+
+ARK2-DIRK-3-1-2
+^^^^^^^^^^^^^^^
+
+.. index:: ARK2-DIRK-3-1-2
+
+Accessible via the constant ``ARKODE_ARK2_DIRK_3_1_2`` to
+:c:func:`ARKStepSetTableNum()`, or
+:c:func:`ARKodeButcherTable_LoadDIRK()`.
+Accessible via the string ``"ARKODE_ARK2_DIRK_3_1_2"`` to
+:c:func:`ARKStepSetTableName()`, or
+:c:func:`ARKodeButcherTable_LoadDIRKByName()`.
+This is the implicit portion of the default 2nd order additive method (the
+implicit portion of the ARK2 method from :cite:p:`giraldo2013implicit`).
+
+.. math::
+
+   \renewcommand{\arraystretch}{1.5}
+   \begin{array}{r|ccc}
+     0            & 0                      & 0                      & 0 \\
+     2 - \sqrt{2} & 1 - \frac{1}{\sqrt{2}} & 1 - \frac{1}{\sqrt{2}} & 0 \\
+     1            & \frac{1}{2\sqrt{2}}    & \frac{1}{2\sqrt{2}}    & 1 - \frac{1}{\sqrt{2}} \\
+     \hline
+     2 & \frac{1}{2\sqrt{2}}    & \frac{1}{2\sqrt{2}}    & 1 - \frac{1}{\sqrt{2}} \\
+     1 & \frac{4 - \sqrt{2}}{8} & \frac{4 - \sqrt{2}}{8} & \frac{1}{2\sqrt{2}}    \\
+   \end{array}
+
+.. figure:: /figs/arkode/ark2_dirk_stab_region.png
+   :scale: 65 %
+   :align: center
+
+   Linear stability region for the ARK2-DIRK method. The method's
+   region is outlined in blue; the embedding's region is in red.
+
 
 .. _Butcher.Billington:
 
@@ -1590,10 +1660,16 @@ Additive Butcher tables
 ---------------------------
 
 In the category of additive Runge--Kutta methods for split implicit and
-explicit calculations, ARKODE includes methods that have orders 3
-through 5, with embeddings that are of orders 2 through 4.  These
+explicit calculations, ARKODE includes methods that have orders 2
+through 5, with embeddings that are of orders 1 through 4.  These
 Butcher table pairs are as follows:
 
+* :index:`2nd-order pair <ARK-3-1-2 ARK method>`:
+  :numref:`Butcher.ARK2_ERK` with :numref:`Butcher.ARK2_DIRK`,
+  corresponding to Butcher tables ``ARKODE_ARK2_ERK_3_1_2`` and
+  ``ARKODE_ARK2_DIRK_3_1_2`` for :c:func:`ARKStepSetTableNum()`
+  or :c:func:`ARKStepSetTableName()`.
+
 * :index:`3rd-order pair <ARK-4-2-3 ARK method>`:
   :numref:`Butcher.ARK_4_2_3_E` with :numref:`Butcher.ARK_4_2_3_I`,
   corresponding to Butcher tables ``ARKODE_ARK324L2SA_ERK_4_2_3`` and
diff --git a/doc/arkode/guide/source/Introduction.rst b/doc/arkode/guide/source/Introduction.rst
index 10174ccd8c..7e8d6d6fa6 100644
--- a/doc/arkode/guide/source/Introduction.rst
+++ b/doc/arkode/guide/source/Introduction.rst
@@ -118,6 +118,18 @@ provided with SUNDIALS, or again may utilize a user-supplied module.
 Changes from previous versions
 ==============================
 
+Changes in v5.6.0
+-----------------
+
+Added the second order IMEX method from :cite:p:`giraldo2013implicit` as the
+default second order IMEX method in ARKStep. The explicit table is given by
+``ARKODE_ARK2_ERK_3_1_2`` (see :numref:`Butcher.ARK2_ERK`) and the implicit
+table by ``ARKODE_ARK2_DIRK_3_1_2`` (see :numref:`Butcher.ARK2_DIRK`).
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
 Changes in v5.5.1
 -----------------
 
diff --git a/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst b/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst
index cd4acec555..97fa502b7e 100644
--- a/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst
+++ b/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst
@@ -1454,7 +1454,7 @@ Set additive RK tables via their names    :c:func:`ARKStepSetTableName()`    int
       For explicit methods, the allowed values are :math:`2 \le`
       *ord* :math:`\le 8`.  For implicit methods, the allowed values are
       :math:`2\le` *ord* :math:`\le 5`, and for ImEx methods the allowed
-      values are :math:`3 \le` *ord* :math:`\le 5`.  Any illegal input
+      values are :math:`2 \le` *ord* :math:`\le 5`.  Any illegal input
       will result in the default value of 4.
 
       Since *ord* affects the memory requirements for the internal
diff --git a/doc/cvode/guide/source/Introduction.rst b/doc/cvode/guide/source/Introduction.rst
index 496dfa5be6..debed4bd24 100644
--- a/doc/cvode/guide/source/Introduction.rst
+++ b/doc/cvode/guide/source/Introduction.rst
@@ -111,6 +111,13 @@ implementations.
 Changes from previous versions
 ==============================
 
+Changes in v6.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
 Changes in v6.5.1
 -----------------
 
@@ -145,7 +152,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v6.4.1
 -----------------
diff --git a/doc/cvodes/guide/source/Introduction.rst b/doc/cvodes/guide/source/Introduction.rst
index aec32d3649..dd0384c250 100644
--- a/doc/cvodes/guide/source/Introduction.rst
+++ b/doc/cvodes/guide/source/Introduction.rst
@@ -111,6 +111,13 @@ Fortran.
 Changes from previous versions
 ==============================
 
+Changes in v6.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
 Changes in v6.5.1
 -----------------
 
@@ -146,7 +153,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v6.4.1
 -----------------
diff --git a/doc/ida/guide/source/Introduction.rst b/doc/ida/guide/source/Introduction.rst
index 76458544b9..b2a5a15671 100644
--- a/doc/ida/guide/source/Introduction.rst
+++ b/doc/ida/guide/source/Introduction.rst
@@ -72,6 +72,13 @@ systems.
 Changes from previous versions
 ==============================
 
+Changes in v6.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
 Changes in v6.5.1
 -----------------
 
@@ -107,7 +114,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v6.4.1
 -----------------
diff --git a/doc/idas/guide/source/Introduction.rst b/doc/idas/guide/source/Introduction.rst
index f5b828966d..a2fd55c464 100644
--- a/doc/idas/guide/source/Introduction.rst
+++ b/doc/idas/guide/source/Introduction.rst
@@ -86,6 +86,14 @@ integrate any final-condition ODE dependent on the solution of the original IVP
 Changes from previous versions
 ==============================
 
+Changes in v5.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
+
 Changes in v5.5.1
 -----------------
 
@@ -121,7 +129,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v5.4.1
 -----------------
diff --git a/doc/kinsol/guide/source/Introduction.rst b/doc/kinsol/guide/source/Introduction.rst
index 6e6edcffcf..3b8f5a1f21 100644
--- a/doc/kinsol/guide/source/Introduction.rst
+++ b/doc/kinsol/guide/source/Introduction.rst
@@ -88,6 +88,14 @@ applications written in Fortran.
 Changes from previous versions
 ==============================
 
+Changes in v6.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
+
 Changes in v6.5.1
 -----------------
 
@@ -115,7 +123,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v6.4.1
 -----------------
diff --git a/doc/shared/Install.rst b/doc/shared/Install.rst
index 7ad8c3bcd1..4682ba1906 100644
--- a/doc/shared/Install.rst
+++ b/doc/shared/Install.rst
@@ -776,6 +776,20 @@ illustration only.
 
    Default: none
 
+.. cmakeoption:: SUNDIALS_ONEMKL_USE_GETRF_LOOP
+
+   This advanced debugging option replaces the batched LU factorization with a
+   loop over each system in the batch and a non-batched LU factorization.
+
+   Default: OFF
+
+.. cmakeoption:: SUNDIALS_ONEMKL_USE_GETRS_LOOP
+
+   This advanced debugging option replaces the batched LU solve with a loop over
+   each system in the batch and a non-batched solve.
+
+   Default: OFF
+
 .. cmakeoption:: ENABLE_OPENMP
 
    Enable OpenMP support (build the OpenMP NVector)
@@ -944,6 +958,14 @@ illustration only.
       ``dpcpp`` and ``icpx``. When using ``icpx`` the ``-fsycl`` flag and any
       ahead of time compilation flags must be added to ``CMAKE_CXX_FLAGS``.
 
+.. cmakeoption:: SUNDIALS_SYCL_2020_UNSUPPORTED
+
+   This advanced option disables the use of *some* features from the SYCL 2020
+   standard in SUNDIALS libraries and examples. This can be used to work around
+   some cases of incomplete compiler support for SYCL 2020.
+
+   Default: OFF
+
 
 .. cmakeoption:: SUNDIALS_LOGGING_LEVEL
 
diff --git a/doc/shared/figs/arkode/ark2_dirk_stab_region.png b/doc/shared/figs/arkode/ark2_dirk_stab_region.png
new file mode 100644
index 0000000000..83929af11e
Binary files /dev/null and b/doc/shared/figs/arkode/ark2_dirk_stab_region.png differ
diff --git a/doc/shared/figs/arkode/ark2_erk_stab_region.png b/doc/shared/figs/arkode/ark2_erk_stab_region.png
new file mode 100644
index 0000000000..45b125d708
Binary files /dev/null and b/doc/shared/figs/arkode/ark2_erk_stab_region.png differ
diff --git a/doc/shared/sundials.bib b/doc/shared/sundials.bib
index 2851d968bd..d62f0d9c22 100644
--- a/doc/shared/sundials.bib
+++ b/doc/shared/sundials.bib
@@ -1784,6 +1784,19 @@ @techreport{Fehlberg:69
   year        = {1969}
 }
 
+
+@article{giraldo2013implicit,
+  title     = {Implicit-explicit formulations of a three-dimensional nonhydrostatic unified model of the atmosphere (NUMA)},
+  author    = {Giraldo, F. X. and Kelly, J. F. and Constantinescu, E. M.},
+  journal   = {SIAM Journal on Scientific Computing},
+  volume    = {35},
+  number    = {5},
+  pages     = {B1162--B1194},
+  year      = {2013},
+  publisher = {SIAM},
+  doi       = {10.1137/120876034}
+}
+
 @article{Gust:91,
   author  = {Gustafsson, K.},
   title   = {Control theoretic techniques for stepsize selection in explicit {Runge-Kutta} methods},
diff --git a/doc/shared/sundials/Fortran.rst b/doc/shared/sundials/Fortran.rst
index 20246ce8f8..bef4eb5aca 100644
--- a/doc/shared/sundials/Fortran.rst
+++ b/doc/shared/sundials/Fortran.rst
@@ -490,8 +490,10 @@ a C file pointer, SUNDIALS provides two utility functions for creating a
    the provided filename and I/O mode.
 
    **Arguments:**
-      * ``filename`` -- the full path to the file, that should have Fortran
-        type ``character(kind=C_CHAR, len=*)``.
+      * ``filename`` -- the path to the file, that should have Fortran
+        type ``character(kind=C_CHAR, len=*)``.  There are two special filenames:
+        ``stdout`` and ``stderr`` -- these two filenames will result in output
+        going to the standard output file and standard error file, respectively.
       * ``mode`` -- the I/O mode to use for the file.  This should have the
         Fortran type ``character(kind=C_CHAR, len=*)``.  The string begins
         with one of the following characters:
@@ -517,7 +519,9 @@ a C file pointer, SUNDIALS provides two utility functions for creating a
 
    **Arguments:**
       * ``fp`` -- the C ``FILE*`` that was previously obtained from ``fopen``.
-        This should have the Fortran type ``type(c_ptr)``.
+        This should have the Fortran type ``type(c_ptr)``.  Note that if either
+        ``stdout`` or ``stderr`` were opened using :c:func:`SUNDIALSFileOpen()`
+        then that stream *will not be closed* by this function.
 
 
 .. _SUNDIALS.Fortran.Portability:
diff --git a/doc/sundials_developers/source/benchmarks/diffusion.rst b/doc/sundials_developers/source/benchmarks/diffusion.rst
index 54b5b08e05..5c35ac4ec2 100644
--- a/doc/sundials_developers/source/benchmarks/diffusion.rst
+++ b/doc/sundials_developers/source/benchmarks/diffusion.rst
@@ -30,11 +30,11 @@ This code simulates the anisotropic 2D heat equation,
 
 .. math::
 
-    u_t = \nabla \cdot (D \nabla u) + b(t,\mathbf{x}),
+    \frac{\partial u}{\partial t} = \nabla \cdot (D \nabla u) + b(t,\mathbf{x}),
 
 where :math:`D` is a diagonal matrix with entries :math:`k_x` and :math:`k_y`.
 The system is evolved for :math:`t \in [0, t_f]` on the rectangular domain
-:math:`(x,y) \equiv \mathbf{x} \in [\mathbf{0}, \mathbf{x_{\text{max}}}]^2`,
+:math:`(x,y) \equiv \mathbf{x} \in [\mathbf{0}, \mathbf{x}_{\text{max}}]^2`,
 with the initial condition
 
 .. math::
@@ -45,8 +45,8 @@ and stationary boundary conditions
 
 .. math::
 
-   u_t(t,0,y) = u_t(t,x_{\text{max}},y) =
-   u_t(t,x,0) = u_t(t,x,y_{\text{max}}) = 0.
+   \frac{\partial u}{\partial t}(t,0,y) = \frac{\partial u}{\partial t}(t,x_{\text{max}},y) =
+   \frac{\partial u}{\partial t}(t,x,0) = \frac{\partial u}{\partial t}(t,x,y_{\text{max}}) = 0.
 
 The source term is given by
 
@@ -68,9 +68,12 @@ IDA. With ARKODE, an adaptive step diagonally implicit Runge-Kutta (DIRK) method
 is applied. When using CVODE or IDA, adaptive order and step BDF methods are
 used.
 
-In all cases, the nonlinear system(s) in each time step are solved using an
-inexact Newton method paired with a matrix-free PCG or GMRES linear solver and a
-Jacobi preconditioner.
+By default, the nonlinear system(s) in each time step are solved using an
+inexact Newton method paired with a matrix-free CG linear solver and a Jacobi
+preconditioner. A matrix-free GMRES linear solver may be selected at run time.
+If SUNDIALS is built with the SuperLU_DIST interface enabled, a modified Newton
+method with SuperLU_DIST as the direct linear solver may also be selected at run
+time.
 
 
 Options
@@ -143,7 +146,8 @@ listed in :numref:`Benchmarks.Table.2D_diffusion_options`.
    |                               | ``ONE_STEP`` mode for debugging      |               |
    |                               | (0 uses ``NORMAL`` mode)             |               |
    +-------------------------------+--------------------------------------+---------------+
-   | ``--gmres``                   | Use GMRES rather than PCG            | PCG           |
+   | ``--ls``                      | Linear solver: ``cg``, ``gmres``,    | ``cg``        |
+   |                               | ``sludist``                          |               |
    +-------------------------------+--------------------------------------+---------------+
    | ``--lsinfo``                  | Output linear solver diagnostics     | Off           |
    +-------------------------------+--------------------------------------+---------------+
@@ -169,22 +173,33 @@ listed in :numref:`Benchmarks.Table.2D_diffusion_options`.
    +-------------------------------+--------------------------------------+---------------+
 
 
-Building and Running
-^^^^^^^^^^^^^^^^^^^^
+Building
+^^^^^^^^
 
 To build the benchmark executables SUNDIALS should be configured with ARKODE,
-CVODE, or IDA enabled and with MPI support on. Additionally, either CUDA or HIP
-support must be on to build executables utilizing NVIDIA or AMD GPUs. See the
-installation guide for more details on configuring, building, and installing
-SUNDIALS.
+CVODE, or IDA enabled, MPI support turned on, and benchmarks enabled. If
+SUNDIALS is configured with SuperLU_DIST enabled, this linear solver can be
+selected at run time and may utilize OpenMP, CUDA, or ROCM (HIP) for on-node
+parallelism. If SUNDIALS is configured with CUDA or HIP support enabled,
+additional executables utilizing CUDA and HIP will be built. See the SUNDIALS
+installation guide for more details on configuring, building, and installing.
+
+Running
+^^^^^^^
 
 Based on the configuration, executables for each integrator and backend option
-are built and installed in the ``<install prefix>/bin/benchmarks/diffusion_2D``
-directory. The executables follow the naming convention
-``<package>_diffusion_2D_<parallelism>`` where ``<package>`` is ``arkode``,
+are built and installed in ``<BENCHMARKS_INSTALL_PATH>/diffusion_2D``. The
+executables follow the naming convention
+``<package>_diffusion_2D_<parallelism>`` where ``<package>`` is ``arkode``,
 ``cvode``, or ``ida`` and ``<parallelism>`` is ``mpi`` for MPI only parallelism,
 ``mpicuda`` for MPI + CUDA, and ``mpihip`` for MPI + HIP.
 
+.. note::
+
+   When using the SuperLU_DIST linear solver, computations will be offloaded to
+   the GPU in the MPI only executables if CUDA or ROCM support is enabled in
+   SuperLU_DIST.
+
 On Summit, with the default environment
 
 * Compiler: xl/16.1.1-5
@@ -209,3 +224,15 @@ an example ``jsrun`` command using CUDA-aware MPI
 .. code-block:: none
 
    jsrun -n 2 -a 1 -c 1 -g 1 ./cvode_diffusion_2D_mpicuda
+
+On Crusher, with the environment
+
+* Compiler: clang/14.0.2
+* MPI: cray-mpich/8.1.17
+* ROCM: rocm/5.2.0
+
+an example ``srun`` command is
+
+.. code-block:: none
+
+   srun -N1 -n8 -c1 --gpus-per-node=8 --gpu-bind=closest ./cvode_diffusion_2D_mpi
diff --git a/doc/sundials_developers/source/testing/CI.rst b/doc/sundials_developers/source/testing/CI.rst
index 928564e70a..52626c5301 100644
--- a/doc/sundials_developers/source/testing/CI.rst
+++ b/doc/sundials_developers/source/testing/CI.rst
@@ -275,11 +275,11 @@ Caliper, as we need a newer version than in the Spack commit currently used.
 Updating Spack
 --------------
 
-To update the spack commit used for the CI, the first thing to do is update
-the spack commit in the ``.uberenv_config.json`` file. Then, a pipeline
-should be manually launched with the ``SHARED_SPACK`` CI variable set
-to ``ON``. This will cause Spack to re-concretize the specs and ideally
-update to newer packages. However, there is no guarantee that individual
-dependencies will be updated, so due dilligence is required (i.e., ensure
-that the output from the CI job shows that Spack has selected the versions
-of dependencies that you expected).
+To update the spack commit used for the CI:
+
+1. The first thing to do is update the spack commit in the ``.uberenv_config.json`` file.
+2. Then, a pipeline should be manually launched from the GitLab UI with the ``SHARED_SPACK`` CI variable set
+   to ``ON`` and the ``SPACK_PREFIX`` variable set to the version of spack specified in ``.uberenv_config.json``.
+
+This will create a new spack installation and rebuild all of the specs.
+
diff --git a/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp b/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp
index 6c176ee53e..45eae942b2 100644
--- a/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp
+++ b/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp
@@ -129,7 +129,7 @@ int main(int argc, char *argv[])
   if (argc > 3) output = (atoi(argv[3])) ? true : false;
 
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp b/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp
index dc3286701e..44fe80d7b7 100644
--- a/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp
+++ b/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp
@@ -119,7 +119,7 @@ int main(int argc, char** argv)
   int retval;
 
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/examples/cvode/kokkos/CMakeLists.txt b/examples/cvode/kokkos/CMakeLists.txt
index 84d0e1528f..3b20ec3cd4 100644
--- a/examples/cvode/kokkos/CMakeLists.txt
+++ b/examples/cvode/kokkos/CMakeLists.txt
@@ -16,6 +16,7 @@
 # 'develop' for examples excluded from 'make test' in releases
 set(examples_list
   "cv_bruss_batched_kokkos.cpp\;\;develop"
+  "cv_bruss_batched_kokkos_2D.cpp\;\;develop"
 )
 
 # Add the build targets for each example
diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out
new file mode 100644
index 0000000000..6f2c19c3a9
--- /dev/null
+++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out
@@ -0,0 +1,137 @@
+
+Batch of independent 3-species kinetics problems
+  number of batches = 100
+  linear solver     = KokkosKernels
+  test type         = 2
+  execution space   = Cuda
+
+At t = 0
+  batch 0: y = 1.2 3.1 3
+  batch 10: y = 1.2 3.1 3
+  batch 20: y = 1.2 3.1 3
+  batch 30: y = 1.2 3.1 3
+  batch 40: y = 1.2 3.1 3
+  batch 50: y = 1.2 3.1 3
+  batch 60: y = 1.2 3.1 3
+  batch 70: y = 1.2 3.1 3
+  batch 80: y = 1.2 3.1 3
+  batch 90: y = 1.2 3.1 3
+At t = 1
+  batch 0: y = 1.10389 3.01314 3.49998
+  batch 10: y = 1.10389 3.01314 3.49998
+  batch 20: y = 1.10389 3.01314 3.49998
+  batch 30: y = 1.10389 3.01314 3.49998
+  batch 40: y = 1.10389 3.01314 3.49998
+  batch 50: y = 1.10389 3.01314 3.49998
+  batch 60: y = 1.10389 3.01314 3.49998
+  batch 70: y = 1.10389 3.01314 3.49998
+  batch 80: y = 1.10389 3.01314 3.49998
+  batch 90: y = 1.10389 3.01314 3.49998
+At t = 2
+  batch 0: y = 0.688033 3.5213 3.49999
+  batch 10: y = 0.688033 3.5213 3.49999
+  batch 20: y = 0.688033 3.5213 3.49999
+  batch 30: y = 0.688033 3.5213 3.49999
+  batch 40: y = 0.688033 3.5213 3.49999
+  batch 50: y = 0.688033 3.5213 3.49999
+  batch 60: y = 0.688033 3.5213 3.49999
+  batch 70: y = 0.688033 3.5213 3.49999
+  batch 80: y = 0.688033 3.5213 3.49999
+  batch 90: y = 0.688033 3.5213 3.49999
+At t = 3
+  batch 0: y = 0.409472 4.27781 3.49999
+  batch 10: y = 0.409472 4.27781 3.49999
+  batch 20: y = 0.409472 4.27781 3.49999
+  batch 30: y = 0.409472 4.27781 3.49999
+  batch 40: y = 0.409472 4.27781 3.49999
+  batch 50: y = 0.409472 4.27781 3.49999
+  batch 60: y = 0.409472 4.27781 3.49999
+  batch 70: y = 0.409472 4.27781 3.49999
+  batch 80: y = 0.409472 4.27781 3.49999
+  batch 90: y = 0.409472 4.27781 3.49999
+At t = 4
+  batch 0: y = 0.36788 4.94194 3.49999
+  batch 10: y = 0.36788 4.94194 3.49999
+  batch 20: y = 0.36788 4.94194 3.49999
+  batch 30: y = 0.36788 4.94194 3.49999
+  batch 40: y = 0.36788 4.94194 3.49999
+  batch 50: y = 0.36788 4.94194 3.49999
+  batch 60: y = 0.36788 4.94194 3.49999
+  batch 70: y = 0.36788 4.94194 3.49999
+  batch 80: y = 0.36788 4.94194 3.49999
+  batch 90: y = 0.36788 4.94194 3.49999
+At t = 5
+  batch 0: y = 0.413842 5.51057 3.49999
+  batch 10: y = 0.413842 5.51057 3.49999
+  batch 20: y = 0.413842 5.51057 3.49999
+  batch 30: y = 0.413842 5.51057 3.49999
+  batch 40: y = 0.413842 5.51057 3.49999
+  batch 50: y = 0.413842 5.51057 3.49999
+  batch 60: y = 0.413842 5.51057 3.49999
+  batch 70: y = 0.413842 5.51057 3.49999
+  batch 80: y = 0.413842 5.51057 3.49999
+  batch 90: y = 0.413842 5.51057 3.49999
+At t = 6
+  batch 0: y = 0.589207 5.85566 3.49999
+  batch 10: y = 0.589207 5.85566 3.49999
+  batch 20: y = 0.589207 5.85566 3.49999
+  batch 30: y = 0.589207 5.85566 3.49999
+  batch 40: y = 0.589207 5.85566 3.49999
+  batch 50: y = 0.589207 5.85566 3.49999
+  batch 60: y = 0.589207 5.85566 3.49999
+  batch 70: y = 0.589207 5.85566 3.49999
+  batch 80: y = 0.589207 5.85566 3.49999
+  batch 90: y = 0.589207 5.85566 3.49999
+At t = 7
+  batch 0: y = 4.75675 0.735405 3.49992
+  batch 10: y = 4.75675 0.735405 3.49992
+  batch 20: y = 4.75675 0.735405 3.49992
+  batch 30: y = 4.75675 0.735405 3.49992
+  batch 40: y = 4.75675 0.735405 3.49992
+  batch 50: y = 4.75675 0.735405 3.49992
+  batch 60: y = 4.75675 0.735405 3.49992
+  batch 70: y = 4.75675 0.735405 3.49992
+  batch 80: y = 4.75675 0.735405 3.49992
+  batch 90: y = 4.75675 0.735405 3.49992
+At t = 8
+  batch 0: y = 1.81355 1.57573 3.49997
+  batch 10: y = 1.81355 1.57573 3.49997
+  batch 20: y = 1.81355 1.57573 3.49997
+  batch 30: y = 1.81355 1.57573 3.49997
+  batch 40: y = 1.81355 1.57573 3.49997
+  batch 50: y = 1.81355 1.57573 3.49997
+  batch 60: y = 1.81355 1.57573 3.49997
+  batch 70: y = 1.81355 1.57573 3.49997
+  batch 80: y = 1.81355 1.57573 3.49997
+  batch 90: y = 1.81355 1.57573 3.49997
+At t = 9
+  batch 0: y = 0.527935 2.80731 3.49999
+  batch 10: y = 0.527935 2.80731 3.49999
+  batch 20: y = 0.527935 2.80731 3.49999
+  batch 30: y = 0.527935 2.80731 3.49999
+  batch 40: y = 0.527935 2.80731 3.49999
+  batch 50: y = 0.527935 2.80731 3.49999
+  batch 60: y = 0.527935 2.80731 3.49999
+  batch 70: y = 0.527935 2.80731 3.49999
+  batch 80: y = 0.527935 2.80731 3.49999
+  batch 90: y = 0.527935 2.80731 3.49999
+At t = 10
+  batch 0: y = 0.305602 3.65734 3.49999
+  batch 10: y = 0.305602 3.65734 3.49999
+  batch 20: y = 0.305602 3.65734 3.49999
+  batch 30: y = 0.305602 3.65734 3.49999
+  batch 40: y = 0.305602 3.65734 3.49999
+  batch 50: y = 0.305602 3.65734 3.49999
+  batch 60: y = 0.305602 3.65734 3.49999
+  batch 70: y = 0.305602 3.65734 3.49999
+  batch 80: y = 0.305602 3.65734 3.49999
+  batch 90: y = 0.305602 3.65734 3.49999
+
+Final Statistics:
+  Steps            = 344
+  RHS evals        = 464
+  LS setups        = 59
+  Jac evals        = 7
+  NLS iters        = 461
+  NLS fails        = 1
+  Error test fails = 20
diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out
new file mode 100644
index 0000000000..69f0b74a18
--- /dev/null
+++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out
@@ -0,0 +1,137 @@
+
+Batch of independent 3-species kinetics problems
+  number of batches = 100
+  linear solver     = KokkosKernels
+  test type         = 2
+  execution space   = OpenMP
+
+At t = 0
+  batch 0: y = 1.2 3.1 3
+  batch 10: y = 1.2 3.1 3
+  batch 20: y = 1.2 3.1 3
+  batch 30: y = 1.2 3.1 3
+  batch 40: y = 1.2 3.1 3
+  batch 50: y = 1.2 3.1 3
+  batch 60: y = 1.2 3.1 3
+  batch 70: y = 1.2 3.1 3
+  batch 80: y = 1.2 3.1 3
+  batch 90: y = 1.2 3.1 3
+At t = 1
+  batch 0: y = 1.10389 3.01314 3.49998
+  batch 10: y = 1.10389 3.01314 3.49998
+  batch 20: y = 1.10389 3.01314 3.49998
+  batch 30: y = 1.10389 3.01314 3.49998
+  batch 40: y = 1.10389 3.01314 3.49998
+  batch 50: y = 1.10389 3.01314 3.49998
+  batch 60: y = 1.10389 3.01314 3.49998
+  batch 70: y = 1.10389 3.01314 3.49998
+  batch 80: y = 1.10389 3.01314 3.49998
+  batch 90: y = 1.10389 3.01314 3.49998
+At t = 2
+  batch 0: y = 0.688033 3.5213 3.49999
+  batch 10: y = 0.688033 3.5213 3.49999
+  batch 20: y = 0.688033 3.5213 3.49999
+  batch 30: y = 0.688033 3.5213 3.49999
+  batch 40: y = 0.688033 3.5213 3.49999
+  batch 50: y = 0.688033 3.5213 3.49999
+  batch 60: y = 0.688033 3.5213 3.49999
+  batch 70: y = 0.688033 3.5213 3.49999
+  batch 80: y = 0.688033 3.5213 3.49999
+  batch 90: y = 0.688033 3.5213 3.49999
+At t = 3
+  batch 0: y = 0.409472 4.27781 3.49999
+  batch 10: y = 0.409472 4.27781 3.49999
+  batch 20: y = 0.409472 4.27781 3.49999
+  batch 30: y = 0.409472 4.27781 3.49999
+  batch 40: y = 0.409472 4.27781 3.49999
+  batch 50: y = 0.409472 4.27781 3.49999
+  batch 60: y = 0.409472 4.27781 3.49999
+  batch 70: y = 0.409472 4.27781 3.49999
+  batch 80: y = 0.409472 4.27781 3.49999
+  batch 90: y = 0.409472 4.27781 3.49999
+At t = 4
+  batch 0: y = 0.36788 4.94194 3.49999
+  batch 10: y = 0.36788 4.94194 3.49999
+  batch 20: y = 0.36788 4.94194 3.49999
+  batch 30: y = 0.36788 4.94194 3.49999
+  batch 40: y = 0.36788 4.94194 3.49999
+  batch 50: y = 0.36788 4.94194 3.49999
+  batch 60: y = 0.36788 4.94194 3.49999
+  batch 70: y = 0.36788 4.94194 3.49999
+  batch 80: y = 0.36788 4.94194 3.49999
+  batch 90: y = 0.36788 4.94194 3.49999
+At t = 5
+  batch 0: y = 0.413842 5.51057 3.49999
+  batch 10: y = 0.413842 5.51057 3.49999
+  batch 20: y = 0.413842 5.51057 3.49999
+  batch 30: y = 0.413842 5.51057 3.49999
+  batch 40: y = 0.413842 5.51057 3.49999
+  batch 50: y = 0.413842 5.51057 3.49999
+  batch 60: y = 0.413842 5.51057 3.49999
+  batch 70: y = 0.413842 5.51057 3.49999
+  batch 80: y = 0.413842 5.51057 3.49999
+  batch 90: y = 0.413842 5.51057 3.49999
+At t = 6
+  batch 0: y = 0.589207 5.85566 3.49999
+  batch 10: y = 0.589207 5.85566 3.49999
+  batch 20: y = 0.589207 5.85566 3.49999
+  batch 30: y = 0.589207 5.85566 3.49999
+  batch 40: y = 0.589207 5.85566 3.49999
+  batch 50: y = 0.589207 5.85566 3.49999
+  batch 60: y = 0.589207 5.85566 3.49999
+  batch 70: y = 0.589207 5.85566 3.49999
+  batch 80: y = 0.589207 5.85566 3.49999
+  batch 90: y = 0.589207 5.85566 3.49999
+At t = 7
+  batch 0: y = 4.75675 0.735405 3.49992
+  batch 10: y = 4.75675 0.735405 3.49992
+  batch 20: y = 4.75675 0.735405 3.49992
+  batch 30: y = 4.75675 0.735405 3.49992
+  batch 40: y = 4.75675 0.735405 3.49992
+  batch 50: y = 4.75675 0.735405 3.49992
+  batch 60: y = 4.75675 0.735405 3.49992
+  batch 70: y = 4.75675 0.735405 3.49992
+  batch 80: y = 4.75675 0.735405 3.49992
+  batch 90: y = 4.75675 0.735405 3.49992
+At t = 8
+  batch 0: y = 1.81355 1.57573 3.49997
+  batch 10: y = 1.81355 1.57573 3.49997
+  batch 20: y = 1.81355 1.57573 3.49997
+  batch 30: y = 1.81355 1.57573 3.49997
+  batch 40: y = 1.81355 1.57573 3.49997
+  batch 50: y = 1.81355 1.57573 3.49997
+  batch 60: y = 1.81355 1.57573 3.49997
+  batch 70: y = 1.81355 1.57573 3.49997
+  batch 80: y = 1.81355 1.57573 3.49997
+  batch 90: y = 1.81355 1.57573 3.49997
+At t = 9
+  batch 0: y = 0.527935 2.80731 3.49999
+  batch 10: y = 0.527935 2.80731 3.49999
+  batch 20: y = 0.527935 2.80731 3.49999
+  batch 30: y = 0.527935 2.80731 3.49999
+  batch 40: y = 0.527935 2.80731 3.49999
+  batch 50: y = 0.527935 2.80731 3.49999
+  batch 60: y = 0.527935 2.80731 3.49999
+  batch 70: y = 0.527935 2.80731 3.49999
+  batch 80: y = 0.527935 2.80731 3.49999
+  batch 90: y = 0.527935 2.80731 3.49999
+At t = 10
+  batch 0: y = 0.305602 3.65734 3.49999
+  batch 10: y = 0.305602 3.65734 3.49999
+  batch 20: y = 0.305602 3.65734 3.49999
+  batch 30: y = 0.305602 3.65734 3.49999
+  batch 40: y = 0.305602 3.65734 3.49999
+  batch 50: y = 0.305602 3.65734 3.49999
+  batch 60: y = 0.305602 3.65734 3.49999
+  batch 70: y = 0.305602 3.65734 3.49999
+  batch 80: y = 0.305602 3.65734 3.49999
+  batch 90: y = 0.305602 3.65734 3.49999
+
+Final Statistics:
+  Steps            = 344
+  RHS evals        = 464
+  LS setups        = 59
+  Jac evals        = 7
+  NLS iters        = 461
+  NLS fails        = 1
+  Error test fails = 20
diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out
new file mode 100644
index 0000000000..6cabd0d57d
--- /dev/null
+++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out
@@ -0,0 +1,137 @@
+
+Batch of independent 3-species kinetics problems
+  number of batches = 100
+  linear solver     = KokkosKernels
+  test type         = 2
+  execution space   = Serial
+
+At t = 0
+  batch 0: y = 1.2 3.1 3
+  batch 10: y = 1.2 3.1 3
+  batch 20: y = 1.2 3.1 3
+  batch 30: y = 1.2 3.1 3
+  batch 40: y = 1.2 3.1 3
+  batch 50: y = 1.2 3.1 3
+  batch 60: y = 1.2 3.1 3
+  batch 70: y = 1.2 3.1 3
+  batch 80: y = 1.2 3.1 3
+  batch 90: y = 1.2 3.1 3
+At t = 1
+  batch 0: y = 1.10389 3.01314 3.49998
+  batch 10: y = 1.10389 3.01314 3.49998
+  batch 20: y = 1.10389 3.01314 3.49998
+  batch 30: y = 1.10389 3.01314 3.49998
+  batch 40: y = 1.10389 3.01314 3.49998
+  batch 50: y = 1.10389 3.01314 3.49998
+  batch 60: y = 1.10389 3.01314 3.49998
+  batch 70: y = 1.10389 3.01314 3.49998
+  batch 80: y = 1.10389 3.01314 3.49998
+  batch 90: y = 1.10389 3.01314 3.49998
+At t = 2
+  batch 0: y = 0.688033 3.5213 3.49999
+  batch 10: y = 0.688033 3.5213 3.49999
+  batch 20: y = 0.688033 3.5213 3.49999
+  batch 30: y = 0.688033 3.5213 3.49999
+  batch 40: y = 0.688033 3.5213 3.49999
+  batch 50: y = 0.688033 3.5213 3.49999
+  batch 60: y = 0.688033 3.5213 3.49999
+  batch 70: y = 0.688033 3.5213 3.49999
+  batch 80: y = 0.688033 3.5213 3.49999
+  batch 90: y = 0.688033 3.5213 3.49999
+At t = 3
+  batch 0: y = 0.409472 4.27781 3.49999
+  batch 10: y = 0.409472 4.27781 3.49999
+  batch 20: y = 0.409472 4.27781 3.49999
+  batch 30: y = 0.409472 4.27781 3.49999
+  batch 40: y = 0.409472 4.27781 3.49999
+  batch 50: y = 0.409472 4.27781 3.49999
+  batch 60: y = 0.409472 4.27781 3.49999
+  batch 70: y = 0.409472 4.27781 3.49999
+  batch 80: y = 0.409472 4.27781 3.49999
+  batch 90: y = 0.409472 4.27781 3.49999
+At t = 4
+  batch 0: y = 0.36788 4.94194 3.49999
+  batch 10: y = 0.36788 4.94194 3.49999
+  batch 20: y = 0.36788 4.94194 3.49999
+  batch 30: y = 0.36788 4.94194 3.49999
+  batch 40: y = 0.36788 4.94194 3.49999
+  batch 50: y = 0.36788 4.94194 3.49999
+  batch 60: y = 0.36788 4.94194 3.49999
+  batch 70: y = 0.36788 4.94194 3.49999
+  batch 80: y = 0.36788 4.94194 3.49999
+  batch 90: y = 0.36788 4.94194 3.49999
+At t = 5
+  batch 0: y = 0.413842 5.51057 3.49999
+  batch 10: y = 0.413842 5.51057 3.49999
+  batch 20: y = 0.413842 5.51057 3.49999
+  batch 30: y = 0.413842 5.51057 3.49999
+  batch 40: y = 0.413842 5.51057 3.49999
+  batch 50: y = 0.413842 5.51057 3.49999
+  batch 60: y = 0.413842 5.51057 3.49999
+  batch 70: y = 0.413842 5.51057 3.49999
+  batch 80: y = 0.413842 5.51057 3.49999
+  batch 90: y = 0.413842 5.51057 3.49999
+At t = 6
+  batch 0: y = 0.589207 5.85566 3.49999
+  batch 10: y = 0.589207 5.85566 3.49999
+  batch 20: y = 0.589207 5.85566 3.49999
+  batch 30: y = 0.589207 5.85566 3.49999
+  batch 40: y = 0.589207 5.85566 3.49999
+  batch 50: y = 0.589207 5.85566 3.49999
+  batch 60: y = 0.589207 5.85566 3.49999
+  batch 70: y = 0.589207 5.85566 3.49999
+  batch 80: y = 0.589207 5.85566 3.49999
+  batch 90: y = 0.589207 5.85566 3.49999
+At t = 7
+  batch 0: y = 4.75675 0.735405 3.49992
+  batch 10: y = 4.75675 0.735405 3.49992
+  batch 20: y = 4.75675 0.735405 3.49992
+  batch 30: y = 4.75675 0.735405 3.49992
+  batch 40: y = 4.75675 0.735405 3.49992
+  batch 50: y = 4.75675 0.735405 3.49992
+  batch 60: y = 4.75675 0.735405 3.49992
+  batch 70: y = 4.75675 0.735405 3.49992
+  batch 80: y = 4.75675 0.735405 3.49992
+  batch 90: y = 4.75675 0.735405 3.49992
+At t = 8
+  batch 0: y = 1.81355 1.57573 3.49997
+  batch 10: y = 1.81355 1.57573 3.49997
+  batch 20: y = 1.81355 1.57573 3.49997
+  batch 30: y = 1.81355 1.57573 3.49997
+  batch 40: y = 1.81355 1.57573 3.49997
+  batch 50: y = 1.81355 1.57573 3.49997
+  batch 60: y = 1.81355 1.57573 3.49997
+  batch 70: y = 1.81355 1.57573 3.49997
+  batch 80: y = 1.81355 1.57573 3.49997
+  batch 90: y = 1.81355 1.57573 3.49997
+At t = 9
+  batch 0: y = 0.527935 2.80731 3.49999
+  batch 10: y = 0.527935 2.80731 3.49999
+  batch 20: y = 0.527935 2.80731 3.49999
+  batch 30: y = 0.527935 2.80731 3.49999
+  batch 40: y = 0.527935 2.80731 3.49999
+  batch 50: y = 0.527935 2.80731 3.49999
+  batch 60: y = 0.527935 2.80731 3.49999
+  batch 70: y = 0.527935 2.80731 3.49999
+  batch 80: y = 0.527935 2.80731 3.49999
+  batch 90: y = 0.527935 2.80731 3.49999
+At t = 10
+  batch 0: y = 0.305602 3.65734 3.49999
+  batch 10: y = 0.305602 3.65734 3.49999
+  batch 20: y = 0.305602 3.65734 3.49999
+  batch 30: y = 0.305602 3.65734 3.49999
+  batch 40: y = 0.305602 3.65734 3.49999
+  batch 50: y = 0.305602 3.65734 3.49999
+  batch 60: y = 0.305602 3.65734 3.49999
+  batch 70: y = 0.305602 3.65734 3.49999
+  batch 80: y = 0.305602 3.65734 3.49999
+  batch 90: y = 0.305602 3.65734 3.49999
+
+Final Statistics:
+  Steps            = 344
+  RHS evals        = 464
+  LS setups        = 59
+  Jac evals        = 7
+  NLS iters        = 461
+  NLS fails        = 1
+  Error test fails = 20
diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp
new file mode 100644
index 0000000000..58a136a74f
--- /dev/null
+++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp
@@ -0,0 +1,425 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                David J. Gardner and Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * The following is a simple example problem based off of ark_brusselator.c.
+ *
+ * We simulate a scenario where a set of independent ODEs are batched together
+ * to form a larger system. Each independent ODE system has 3 components,
+ * Y = [u, v, w], satisfying the equations,
+ *
+ *   du/dt = a - (w + 1) * u + v * u^2
+ *   dv/dt = w * u - v * u^2
+ *   dw/dt = (b - w) / ep - w * u
+ *
+ * for t in the interval [0, 10], with initial conditions Y0 = [u0, v0, w0].
+ * The problem is stiff and there are 3 testing scenarios:
+ *
+ * Reactor 0: u0 = 3.9, v0 = 1.1, w0 = 2.8, a = 1.2, b = 2.5, ep = 1.0e-5
+ *   Here, all three components exhibit a rapid transient change during the
+ *   first 0.2 time units, followed by a slow and smooth evolution.
+ *
+ * Reactor 1: u0 = 3, v0 = 3, w0 = 3.5, a = 0.5, b = 3, ep = 5.0e-4
+ *   Here, all components undergo very rapid initial transients during the first
+ *   0.3 time units, and all then proceed very smoothly for the remainder of the
+ *   simulation.
+ *
+ * Reactor 2: u0 = 1.2, v0 = 3.1, w0 = 3, a = 1, b = 3.5, ep = 5.0e-6
+ *   Here, w experiences a fast initial transient, jumping 0.5 within a few
+ *   steps. All values proceed smoothly until around t=6.5, when both u and v
+ *   undergo a sharp transition, with u increasing from around 0.5 to 5 and v
+ *   decreasing from around 6 to 1 in less than 0.5 time units. After this
+ *   transition, both u and v continue to evolve somewhat rapidly for another
+ *   1.4 time units, and finish off smoothly.
+ *
+ * This program solves the problem with the BDF method, Newton iteration, a
+ * user-supplied Jacobian routine, and, since the grouping of the independent
+ * systems results in a block diagonal linear system, the dense KOKKOS
+ * SUNLinearSolver which supports batched systems. The solution is printed at
+ * 10 equally spaced output times, and run statistics are printed at the end.
+ *
+ * Unlike the example cv_bruss_batched_kokkos.cpp, this example utilizes Kokkos'
+ * multi-dimensional view functionality to consider a 2D grouping, y(i,j), where
+ * i corresponds with the batch index, and j corresponds to the component (u,v,w).
+ *
+ * The program takes three optional arguments: the number of independent ODE
+ * systems (i.e., number of batches), the linear solver type (KOKKOS batched LU
+ * or non-batched GMRES with the Jacobian computed by difference quotients),
+ * and the test type (uniform_0, uniform_1, or uniform_2).
+ *
+ *   ./cv_bruss_batched_kokkos_2D [num_batches] [solver_type] [test_type]
+ *
+ * Options:
+ *   num_batches <int>
+ *   solver_type:
+ *     0 - KOKKOS batched LU (default)
+ *     1 - SUNDIALS non-batched GMRES with difference quotients Jacobian
+ *   test_type:
+ *     0 - uniform_0, all batches are Reactor 0
+ *     1 - uniform_1, all batches are Reactor 1
+ *     2 - uniform_2, all batches are Reactor 2 (default)
+ * ---------------------------------------------------------------------------*/
+
+#include <cstdio>
+#include <cvode/cvode.h>
+#include <memory>
+#include <nvector/nvector_kokkos.hpp>
+#include <sunlinsol/sunlinsol_kokkosdense.hpp>
+#include <sunlinsol/sunlinsol_spgmr.h>
+#include <sunmatrix/sunmatrix_kokkosdense.hpp>
+#include <vector>
+
+// Common utility functions
+#include <example_utilities.hpp>
+
+// Execution and memory space, selected at compile time (default is Serial)
+#if defined(USE_CUDA)
+using ExecSpace = Kokkos::Cuda;
+using MemSpace  = Kokkos::CudaSpace;
+#elif defined(USE_HIP)
+#if KOKKOS_VERSION / 10000 > 3
+using ExecSpace = Kokkos::HIP;
+using MemSpace  = Kokkos::HIPSpace;
+#else
+using ExecSpace = Kokkos::Experimental::HIP;
+using MemSpace  = Kokkos::Experimental::HIPSpace;
+#endif
+#elif defined(USE_OPENMP)
+using ExecSpace = Kokkos::OpenMP;
+using MemSpace  = Kokkos::HostSpace;
+#else
+using ExecSpace = Kokkos::Serial;
+using MemSpace  = Kokkos::HostSpace;
+#endif
+// View and SUNDIALS Kokkos object aliases built on the spaces chosen above
+using Vec1D     = Kokkos::View<sunrealtype*, MemSpace>;
+using Vec2D     = Kokkos::View<sunrealtype**, Kokkos::LayoutRight, MemSpace>;
+using Vec2DHost = Vec2D::HostMirror;
+using VecType   = sundials::kokkos::Vector<ExecSpace>;
+using MatType   = sundials::kokkos::DenseMatrix<ExecSpace>;
+using LSType    = sundials::kokkos::DenseLinearSolver<ExecSpace>;
+using SizeType  = VecType::size_type;
+
+// Floating-point constants in the SUNDIALS real type
+#define ZERO SUN_RCONST(0.0)
+#define ONE  SUN_RCONST(1.0)
+#define TWO  SUN_RCONST(2.0)
+
+// User-supplied functions called by CVODE: ODE right-hand side and Jacobian
+static int f(sunrealtype t, N_Vector y, N_Vector ydot, void* user_data);
+
+static int Jac(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J,
+               void* user_data, N_Vector tmp1, N_Vector tmp2, N_Vector tmp3);
+
+// Problem data handed to f and Jac through their user_data pointer
+struct UserData
+{
+  int nbatches  = 100; // number of independent ODE systems (batches)
+  int batchSize = 3;   // unknowns per system: u, v, w
+  sunrealtype a, b;    // constant parameters of the ODEs; set in main
+  sunrealtype ep;      // stiffness parameter (dw/dt uses 1/ep); set in main
+};
+
+/* -----------------------------------------------------------------------------
+ * Main Program
+ * ---------------------------------------------------------------------------*/
+
+int main(int argc, char* argv[])
+{
+  // Create the SUNDIALS context
+  sundials::Context sunctx;
+  // NOTE(review): early returns below bypass CVodeFree and Kokkos::finalize
+  Kokkos::initialize(argc, argv);
+  {
+    // Problem parameters shared with f and Jac via CVodeSetUserData
+    UserData udata;
+
+    // Parse the optional positional arguments (argi is the last one consumed)
+    int argi = 0;
+
+    // Total number of batch systems
+    if (argc > 1) udata.nbatches = atoi(argv[++argi]);
+
+    // Linear solver type
+    int solver_type = 0;
+    if (argc > 2) solver_type = atoi(argv[++argi]);
+
+    // Test type: selects the initial condition and parameters (default 2)
+    int test_type = 2;
+    if (argc > 3) test_type = atoi(argv[++argi]);
+
+    // Shortcuts
+    int nbatches  = udata.nbatches;
+    int batchSize = udata.batchSize;
+
+    std::cout << "\nBatch of independent 3-species kinetics problems\n"
+              << "  number of batches = " << nbatches << "\n"
+              << "  linear solver     = "
+              << (solver_type ? "GMRES" : "KokkosKernels") << "\n"
+              << "  test type         = " << test_type << "\n"
+              << "  execution space   = " << ExecSpace().name() << "\n\n";
+
+    sunrealtype u0, v0, w0;
+    if (test_type == 0)
+    {
+      u0 = SUN_RCONST(3.9);
+      v0 = SUN_RCONST(1.1);
+      w0 = SUN_RCONST(2.8);
+
+      udata.a  = SUN_RCONST(1.2);
+      udata.b  = SUN_RCONST(2.5);
+      udata.ep = SUN_RCONST(1.0e-5);
+    }
+    else if (test_type == 1)
+    {
+      u0 = SUN_RCONST(3.0);
+      v0 = SUN_RCONST(3.0);
+      w0 = SUN_RCONST(3.5);
+
+      udata.a  = SUN_RCONST(0.5);
+      udata.b  = SUN_RCONST(3.0);
+      udata.ep = SUN_RCONST(5.0e-4);
+    }
+    else if (test_type == 2)
+    {
+      u0 = SUN_RCONST(1.2);
+      v0 = SUN_RCONST(3.1);
+      w0 = SUN_RCONST(3.0);
+
+      udata.a  = SUN_RCONST(1.0);
+      udata.b  = SUN_RCONST(3.5);
+      udata.ep = SUN_RCONST(5.0e-6);
+    }
+    else
+    {
+      std::cerr << "ERROR: Invalid test type option\n";
+      return -1;
+    }
+
+    // Create vector with the initial condition
+    const sunrealtype T0 = SUN_RCONST(0.0);
+
+    SizeType length{static_cast<SizeType>(batchSize * nbatches)};
+    VecType y{length, sunctx};
+    Vec2D y2d((y.View()).data(), nbatches, batchSize);
+    // Set every batch to the same initial state (u0, v0, w0)
+    Kokkos::parallel_for(
+      "fill_y", Kokkos::RangePolicy<ExecSpace>(0, nbatches),
+      KOKKOS_LAMBDA(const SizeType i) {
+        y2d(i,0) = u0;
+        y2d(i,1) = v0;
+        y2d(i,2) = w0;
+      });
+
+    // Create vector of absolute tolerances
+    VecType abstol{length, sunctx};
+    N_VConst(SUN_RCONST(1.0e-10), abstol);
+
+    // Create CVODE using Backward Differentiation Formula methods
+    void* cvode_mem = CVodeCreate(CV_BDF, sunctx);
+    if (check_ptr(cvode_mem, "CVodeCreate")) { return 1; }
+
+    // Initialize the integrator and set the ODE right-hand side function
+    int retval = CVodeInit(cvode_mem, f, T0, y);
+    if (check_flag(retval, "CVodeInit")) { return 1; }
+
+    // Attach the user data structure
+    retval = CVodeSetUserData(cvode_mem, &udata);
+    if (check_flag(retval, "CVodeSetUserData")) { return 1; }
+
+    // Specify the scalar relative tolerance and vector absolute tolerances
+    retval = CVodeSVtolerances(cvode_mem, SUN_RCONST(1.0e-6), abstol);
+    if (check_flag(retval, "CVodeSVtolerances")) { return 1; }
+
+    // Create the matrix and linear solver objects
+    std::unique_ptr<sundials::ConvertibleTo<SUNMatrix>> A;
+    std::unique_ptr<sundials::ConvertibleTo<SUNLinearSolver>> LS;
+
+    if (solver_type == 0)
+    {
+      // Create Kokkos dense block diagonal matrix
+      A = std::make_unique<MatType>(nbatches, batchSize, batchSize, sunctx);
+
+      // Create Kokkos batched dense linear solver
+      LS = std::make_unique<LSType>(sunctx);
+
+      // Attach the matrix and linear solver to CVODE
+      retval = CVodeSetLinearSolver(cvode_mem, LS->Convert(), A->Convert());
+      if (check_flag(retval, "CVodeSetLinearSolver")) return 1;
+
+      // Set the user-supplied Jacobian function
+      retval = CVodeSetJacFn(cvode_mem, Jac);
+      if (check_flag(retval, "CVodeSetJacFn")) return 1;
+    }
+    else
+    {
+      // Create matrix-free GMRES linear solver
+      LS = std::make_unique<sundials::experimental::SUNLinearSolverView>(
+        SUNLinSol_SPGMR(y, SUN_PREC_NONE, 0, sunctx));
+
+      // Attach the linear solver to CVODE
+      retval = CVodeSetLinearSolver(cvode_mem, LS->Convert(), nullptr);
+      if (check_flag(retval, "CVodeSetLinearSolver")) return 1;
+    }
+
+    // Final time and time between outputs
+    const sunrealtype Tf    = SUN_RCONST(10.0);
+    const sunrealtype dTout = SUN_RCONST(1.0);
+
+    // Number of output times
+    const int Nt = static_cast<int>(ceil(Tf / dTout));
+
+    // Current time and first output time
+    sunrealtype t    = T0;
+    sunrealtype tout = T0 + dTout;
+
+    // Initial output; y2d_h is a 2D view of y's host data (filled by the copy)
+    Vec2DHost y2d_h((y.HostView()).data(), nbatches, batchSize);
+    sundials::kokkos::CopyFromDevice(y);
+    Kokkos::fence();
+    std::cout << "At t = " << t << std::endl;
+    for (int j = 0; j < nbatches; j += 10)
+    {
+      std::cout << "  batch " << j << ": y = " << y2d_h(j,0) << " "
+                << y2d_h(j,1) << " " << y2d_h(j,2) << std::endl;
+    }
+
+    // Loop over output times
+    for (int iout = 0; iout < Nt; iout++)
+    {
+      // Advance in time
+      retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL);
+      if (check_flag(retval, "CVode")) break; // NOTE(review): main still returns 0
+
+      // Output solution from some batches
+      sundials::kokkos::CopyFromDevice(y);
+      Kokkos::fence();
+      std::cout << "At t = " << t << std::endl;
+      for (int j = 0; j < nbatches; j += 10)
+      {
+        std::cout << "  batch " << j << ": y = " << y2d_h(j,0) << " "
+                  << y2d_h(j,1) << " " << y2d_h(j,2) << std::endl;
+      }
+      // Move to the next output time without going past Tf
+      tout += dTout;
+      tout = (tout > Tf) ? Tf : tout;
+    }
+
+    // Print some final statistics
+    long int nst, nfe, nsetups, nje, nni, ncfn, netf;
+
+    retval = CVodeGetNumSteps(cvode_mem, &nst);
+    check_flag(retval, "CVodeGetNumSteps");
+    retval = CVodeGetNumRhsEvals(cvode_mem, &nfe);
+    check_flag(retval, "CVodeGetNumRhsEvals");
+    retval = CVodeGetNumLinSolvSetups(cvode_mem, &nsetups);
+    check_flag(retval, "CVodeGetNumLinSolvSetups");
+    retval = CVodeGetNumErrTestFails(cvode_mem, &netf);
+    check_flag(retval, "CVodeGetNumErrTestFails");
+    retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni);
+    check_flag(retval, "CVodeGetNumNonlinSolvIters");
+    retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncfn);
+    check_flag(retval, "CVodeGetNumNonlinSolvConvFails");
+    retval = CVodeGetNumJacEvals(cvode_mem, &nje);
+    check_flag(retval, "CVodeGetNumJacEvals");
+
+    std::cout << "\nFinal Statistics:\n"
+              << "  Steps            = " << nst << "\n"
+              << "  RHS evals        = " << nfe << "\n"
+              << "  LS setups        = " << nsetups << "\n"
+              << "  Jac evals        = " << nje << "\n"
+              << "  NLS iters        = " << nni << "\n"
+              << "  NLS fails        = " << ncfn << "\n"
+              << "  Error test fails = " << netf << "\n";
+
+    // Free objects
+    CVodeFree(&cvode_mem);
+  }
+  Kokkos::finalize();
+
+  return 0;
+}
+
+/* -----------------------------------------------------------------------------
+ * User-supplied functions called by CVODE
+ * ---------------------------------------------------------------------------*/
+
+// ODE right-hand side: fills ydot with f(t,y) for every batch (t is unused)
+int f(sunrealtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  const auto* params = static_cast<UserData*>(user_data);
+  // Problem dimensions and parameters used inside the kernel
+  const auto nbatches  = params->nbatches;
+  const auto batchSize = params->batchSize;
+  const auto a         = params->a;
+  const auto b         = params->b;
+  const auto ep        = params->ep;
+
+  // Wrap the device arrays of y and ydot as (batch, component) views
+  Vec2D state(N_VGetDeviceArrayPointer(y), nbatches, batchSize);
+  Vec2D rate(N_VGetDeviceArrayPointer(ydot), nbatches, batchSize);
+
+  Kokkos::parallel_for(
+    "RHS", Kokkos::RangePolicy<ExecSpace>(0, nbatches),
+    KOKKOS_LAMBDA(const SizeType ib) {
+      const auto u   = state(ib, 0);
+      const auto v   = state(ib, 1);
+      const auto w   = state(ib, 2);
+      const auto vuu = v * u * u; // v*u^2 appears in both du/dt and dv/dt
+      rate(ib, 0)    = a - (w + ONE) * u + vuu;
+      rate(ib, 1)    = w * u - vuu;
+      rate(ib, 2)    = (b - w) / ep - w * u;
+    });
+  return 0;
+}
+
+// Jacobian of f(t,y): block i holds d(f_r)/d(y_c) at J_data(i, r, c)
+int Jac(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data,
+        N_Vector tmp1, N_Vector tmp2, N_Vector tmp3)
+{
+  auto udata  = static_cast<UserData*>(user_data);
+  auto J_data = sundials::kokkos::GetDenseMat<MatType>(J)->View();
+
+  const auto nbatches  = udata->nbatches;
+  const auto batchSize = udata->batchSize;
+  const auto ep        = udata->ep;
+
+  // View y as (batch, component); t, fy and the tmp vectors are not used
+  Vec2D y2d(N_VGetDeviceArrayPointer(y), nbatches, batchSize);
+
+  Kokkos::parallel_for(
+    "Jac", Kokkos::RangePolicy<ExecSpace>(0, nbatches),
+    KOKKOS_LAMBDA(const SizeType i) {
+      // get y values
+      auto u = y2d(i,0);
+      auto v = y2d(i,1);
+      auto w = y2d(i,2);
+
+      // first col of block: derivatives with respect to u
+      J_data(i, 0, 0) = -(w + ONE) + TWO * u * v;
+      J_data(i, 1, 0) = w - TWO * u * v;
+      J_data(i, 2, 0) = -w;
+
+      // second col of block: derivatives with respect to v
+      J_data(i, 0, 1) = u * u;
+      J_data(i, 1, 1) = -u * u;
+      J_data(i, 2, 1) = ZERO;
+
+      // third col of block: derivatives with respect to w
+      J_data(i, 0, 2) = -u;
+      J_data(i, 1, 2) = u;
+      J_data(i, 2, 2) = -ONE / ep - u;
+    });
+
+  return 0;
+}
diff --git a/examples/nvector/sycl/test_nvector_sycl.cpp b/examples/nvector/sycl/test_nvector_sycl.cpp
index 4ce143451c..b3b6f5c8dc 100644
--- a/examples/nvector/sycl/test_nvector_sycl.cpp
+++ b/examples/nvector/sycl/test_nvector_sycl.cpp
@@ -69,7 +69,7 @@ int main(int argc, char *argv[])
   SetTiming(print_timing, 0);
 
   /* Create an in-order GPU queue */
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp b/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp
index 6c63f71e6e..c3adc0a08e 100644
--- a/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp
+++ b/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp
@@ -71,7 +71,7 @@ int main(int argc, char *argv[])
          (long int) cols, (long int) nblocks);
 
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp b/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp
index 75d11de689..615e1ee9dd 100644
--- a/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp
+++ b/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp
@@ -80,7 +80,7 @@ int main(int argc, char *argv[])
          (long int) matrows, (long int) matcols);
 
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/include/arkode/arkode_arkstep.h b/include/arkode/arkode_arkstep.h
index 7d624635c5..2b1ec96962 100644
--- a/include/arkode/arkode_arkstep.h
+++ b/include/arkode/arkode_arkstep.h
@@ -50,9 +50,11 @@ static const int ARKSTEP_DEFAULT_DIRK_4 = ARKODE_SDIRK_5_3_4;
 static const int ARKSTEP_DEFAULT_DIRK_5 = ARKODE_ARK548L2SA_DIRK_8_4_5;
 
 /*    ImEx */
+static const int ARKSTEP_DEFAULT_ARK_ETABLE_2 = ARKODE_ARK2_ERK_3_1_2;
 static const int ARKSTEP_DEFAULT_ARK_ETABLE_3 = ARKODE_ARK324L2SA_ERK_4_2_3;
 static const int ARKSTEP_DEFAULT_ARK_ETABLE_4 = ARKODE_ARK436L2SA_ERK_6_3_4;
 static const int ARKSTEP_DEFAULT_ARK_ETABLE_5 = ARKODE_ARK548L2SA_ERK_8_4_5;
+static const int ARKSTEP_DEFAULT_ARK_ITABLE_2 = ARKODE_ARK2_DIRK_3_1_2;
 static const int ARKSTEP_DEFAULT_ARK_ITABLE_3 = ARKODE_ARK324L2SA_DIRK_4_2_3;
 static const int ARKSTEP_DEFAULT_ARK_ITABLE_4 = ARKODE_ARK436L2SA_DIRK_6_3_4;
 static const int ARKSTEP_DEFAULT_ARK_ITABLE_5 = ARKODE_ARK548L2SA_DIRK_8_4_5;
diff --git a/include/arkode/arkode_butcher_dirk.h b/include/arkode/arkode_butcher_dirk.h
index 76f1d1cb7d..aecaf16a82 100644
--- a/include/arkode/arkode_butcher_dirk.h
+++ b/include/arkode/arkode_butcher_dirk.h
@@ -92,7 +92,8 @@ typedef enum {
   ARKODE_ESDIRK437L2SA_7_3_4,
   ARKODE_ESDIRK547L2SA_7_4_5,
   ARKODE_ESDIRK547L2SA2_7_4_5,
-  ARKODE_MAX_DIRK_NUM = ARKODE_ESDIRK547L2SA2_7_4_5
+  ARKODE_ARK2_DIRK_3_1_2,
+  ARKODE_MAX_DIRK_NUM = ARKODE_ARK2_DIRK_3_1_2
 } ARKODE_DIRKTableID;
 
 /* Accessor routine to load built-in DIRK table */
diff --git a/include/arkode/arkode_butcher_erk.h b/include/arkode/arkode_butcher_erk.h
index acd1d613fb..6673acb119 100644
--- a/include/arkode/arkode_butcher_erk.h
+++ b/include/arkode/arkode_butcher_erk.h
@@ -84,7 +84,8 @@ typedef enum {
   ARKODE_KNOTH_WOLKE_3_3,
   ARKODE_ARK437L2SA_ERK_7_3_4,
   ARKODE_ARK548L2SAb_ERK_8_4_5,
-  ARKODE_MAX_ERK_NUM = ARKODE_ARK548L2SAb_ERK_8_4_5
+  ARKODE_ARK2_ERK_3_1_2,
+  ARKODE_MAX_ERK_NUM = ARKODE_ARK2_ERK_3_1_2
 } ARKODE_ERKTableID;
 
 /* Accessor routine to load built-in ERK table */
diff --git a/include/nvector/nvector_kokkos.hpp b/include/nvector/nvector_kokkos.hpp
index 1b424a7d44..269d15dcef 100644
--- a/include/nvector/nvector_kokkos.hpp
+++ b/include/nvector/nvector_kokkos.hpp
@@ -650,6 +650,20 @@ void CopyFromDevice(VectorType& v)
   Kokkos::deep_copy(v.HostView(), v.View());
 }
 
+// Return the device view of the Kokkos vector wrapped by the N_Vector v
+template<class VectorType, class view_type>
+view_type GetView(N_Vector v)
+{
+  return GetVec<VectorType>(v)->View();
+}
+
+// Return the host view of the Kokkos vector wrapped by the N_Vector v
+template<class VectorType, class host_view_type>
+host_view_type GetHostView(N_Vector v)
+{
+  return GetVec<VectorType>(v)->HostView();
+}
+
 } // namespace kokkos
 } // namespace sundials
 
diff --git a/include/sundials/sundials_config.in b/include/sundials/sundials_config.in
index 4c79b40040..f3f66ff89c 100644
--- a/include/sundials/sundials_config.in
+++ b/include/sundials/sundials_config.in
@@ -130,12 +130,16 @@
  */
 #cmakedefine01 SUNDIALS_MPI_ENABLED
 
- /* SUPERLUMT threading type */
+/* oneMKL interface options */
+#cmakedefine SUNDIALS_ONEMKL_USE_GETRF_LOOP
+#cmakedefine SUNDIALS_ONEMKL_USE_GETRS_LOOP
+
+/* SUPERLUMT threading type */
 #define SUNDIALS_SUPERLUMT_THREAD_TYPE "@SUPERLUMT_THREAD_TYPE@"
 
- /* Trilinos with MPI is available, then
-  *    #define SUNDIALS_TRILINOS_HAVE_MPI
-  */
+/* Trilinos with MPI is available, then
+ *    #define SUNDIALS_TRILINOS_HAVE_MPI
+ */
 #cmakedefine SUNDIALS_TRILINOS_HAVE_MPI
 
 /* RAJA backends */
@@ -143,6 +147,10 @@
 #cmakedefine SUNDIALS_RAJA_BACKENDS_HIP
 #cmakedefine SUNDIALS_RAJA_BACKENDS_SYCL
 
+/* SYCL options */
+#cmakedefine SUNDIALS_SYCL_2020_UNSUPPORTED
+
+
 /* ------------------------------------------------------------------
  * SUNDIALS modules enabled
  * -----------------------------------------------------------------*/
diff --git a/scripts/cvode b/scripts/cvode
index 57261482ba..af97c77f74 100755
--- a/scripts/cvode
+++ b/scripts/cvode
@@ -221,6 +221,10 @@ $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.cpp
 $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.CUDA.out
 $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.OPENMP.out
 $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.SERIAL.out
+$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp
+$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out
+$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out
+$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out
 
 $tar $tarfile $distrobase/examples/cvode/magma/README
 $tar $tarfile $distrobase/examples/cvode/magma/CMakeLists.txt
diff --git a/src/arkode/arkode_arkstep.c b/src/arkode/arkode_arkstep.c
index ef38649f90..8b5bde3a1b 100644
--- a/src/arkode/arkode_arkstep.c
+++ b/src/arkode/arkode_arkstep.c
@@ -1924,6 +1924,9 @@ int arkStep_SetButcherTables(ARKodeMem ark_mem)
     switch (step_mem->q) {
 
     case(2):
+      etable = ARKSTEP_DEFAULT_ARK_ETABLE_2;
+      itable = ARKSTEP_DEFAULT_ARK_ITABLE_2;
+      break;
     case(3):
       etable = ARKSTEP_DEFAULT_ARK_ETABLE_3;
       itable = ARKSTEP_DEFAULT_ARK_ITABLE_3;
diff --git a/src/arkode/arkode_butcher_dirk.c b/src/arkode/arkode_butcher_dirk.c
index d1346acdd4..b74a543a52 100644
--- a/src/arkode/arkode_butcher_dirk.c
+++ b/src/arkode/arkode_butcher_dirk.c
@@ -66,8 +66,8 @@ ARKODE_DIRKTableID arkButcherTableDIRKNameToID(const char *imethod) {
 #undef ARK_BUTCHER_TABLE
 
   arkProcessError(NULL, ARK_ILL_INPUT, "ARKODE",
-	   "arkButcherTableDIRKNameToID",
-	   "Unknown Butcher table");
+                  "arkButcherTableDIRKNameToID",
+                  "Unknown Butcher table");
 
   return ARKODE_DIRK_NONE;
 }
diff --git a/src/arkode/arkode_butcher_dirk.def b/src/arkode/arkode_butcher_dirk.def
index f9e63023ab..a3cca75f66 100644
--- a/src/arkode/arkode_butcher_dirk.def
+++ b/src/arkode/arkode_butcher_dirk.def
@@ -57,7 +57,8 @@
      ARKODE_ARK548L2SAb_DIRK_8_4_5*   ESDIRK     Y         Y       N
      ARKODE_ESDIRK547L2SA_7_4_5       ESDIRK     Y         Y       N
      ARKODE_ESDIRK547L2SA2_7_4_5      ESDIRK     Y         Y       N
-    -----------------------------------------------------------------
+     ARKODE_ARK2_DIRK_3_1_2           ESDIRK     Y         Y       Y
+     -----------------------------------------------------------------
 */
 
 ARK_BUTCHER_TABLE(ARKODE_DIRK_NONE, {
@@ -68,22 +69,56 @@ ARK_BUTCHER_TABLE(ARKODE_SDIRK_2_1_2, { /* SDIRK-2-1 (A,B stable) */
     ARKodeButcherTable B = ARKodeButcherTable_Alloc(2, SUNTRUE);
     B->q = 2;
     B->p = 1;
-    
+
     B->A[0][0] = RCONST(1.0);
     B->A[1][0] = RCONST(-1.0);
     B->A[1][1] = RCONST(1.0);
-    
+
     B->b[0] = RCONST(0.5);
     B->b[1] = RCONST(0.5);
-    
+
     B->d[0] = RCONST(1.0);
 
-    
     B->c[0] = RCONST(1.0);
     B->c[1] = RCONST(0.0);
     return B;
   })
 
+ARK_BUTCHER_TABLE(ARKODE_ARK2_DIRK_3_1_2, { /* ARK2 Implicit Table (A,L stable) */
+    /* sqrt(2) appears in every constant below; evaluate it once */
+    const sunrealtype sqrt2 = SUNRsqrt(SUN_RCONST(2.0));
+    /* gam = 1 - 1 / sqrt(2) */
+    const sunrealtype gam = SUN_RCONST(1.0) - SUN_RCONST(1.0) / sqrt2;
+    /* dlt = 1 / (2 sqrt(2)) */
+    const sunrealtype dlt = SUN_RCONST(1.0) / (SUN_RCONST(2.0) * sqrt2);
+    /* two_gam = 2 - sqrt(2) */
+    const sunrealtype two_gam = SUN_RCONST(2.0) - sqrt2;
+    /* bet = (4 - sqrt(2)) / 8 */
+    const sunrealtype bet = (SUN_RCONST(4.0) - sqrt2) / SUN_RCONST(8.0);
+
+    ARKodeButcherTable B = ARKodeButcherTable_Alloc(3, SUNTRUE);
+    B->q = 2;
+    B->p = 1;
+
+    B->c[1] = two_gam;
+    B->c[2] = SUN_RCONST(1.0);
+
+    B->A[1][0] = gam;
+    B->A[1][1] = gam;
+    B->A[2][0] = dlt;
+    B->A[2][1] = dlt;
+    B->A[2][2] = gam;
+
+    B->b[0] = dlt;
+    B->b[1] = dlt;
+    B->b[2] = gam;
+
+    B->d[0] = bet;
+    B->d[1] = bet;
+    B->d[2] = dlt;
+    return B;
+  })
+
 ARK_BUTCHER_TABLE(ARKODE_BILLINGTON_3_3_2, { /* Billington-SDIRK */
     ARKodeButcherTable B = ARKodeButcherTable_Alloc(3, SUNTRUE);
 
@@ -620,7 +655,7 @@ ARK_BUTCHER_TABLE(ARKODE_ESDIRK324L2SA_4_2_3, { /* ESDIRK3(2)4L[2]SA (A,L stable
     const sunrealtype g4 = g3 * g;
     const sunrealtype g5 = g4 * g;
     const sunrealtype c3 = RCONST(0.6);
-    
+
     ARKodeButcherTable B = ARKodeButcherTable_Alloc(4, SUNTRUE);
     B->q = 3;
     B->p = 2;
diff --git a/src/arkode/arkode_butcher_erk.def b/src/arkode/arkode_butcher_erk.def
index 43a8fe7210..f25fafb830 100644
--- a/src/arkode/arkode_butcher_erk.def
+++ b/src/arkode/arkode_butcher_erk.def
@@ -52,6 +52,7 @@
      ARKODE_ARK548L2SAb_ERK_8_4_5*       N
      ARKODE_VERNER_8_5_6                 Y
      ARKODE_FEHLBERG_13_7_8              Y
+     ARKODE_ARK2_ERK_3_1_2               Y
     --------------------------------
      ARKODE_KNOTH_WOLKE_3_3^             Y
     --------------------------------
@@ -75,7 +76,42 @@ ARK_BUTCHER_TABLE(ARKODE_HEUN_EULER_2_1_2, { /* Heun-Euler-ERK */
 
     B->c[1] = RCONST(1.0);
     return B;
- }) 
+ })
+
+ARK_BUTCHER_TABLE(ARKODE_ARK2_ERK_3_1_2, { /* ARK2 Explicit Table */
+    /* sqrt(2) appears in every constant below; evaluate it once */
+    const sunrealtype sqrt2 = SUNRsqrt(SUN_RCONST(2.0));
+    /* gam = 1 - 1 / sqrt(2) */
+    const sunrealtype gam = SUN_RCONST(1.0) - SUN_RCONST(1.0) / sqrt2;
+    /* alp = (3 + 2 sqrt(2)) / 6 */
+    const sunrealtype alp = (SUN_RCONST(3.0) + SUN_RCONST(2.0) * sqrt2) / SUN_RCONST(6.0);
+    /* dlt = 1 / (2 sqrt(2)) */
+    const sunrealtype dlt = SUN_RCONST(1.0) / (SUN_RCONST(2.0) * sqrt2);
+    /* two_gam = 2 - sqrt(2) */
+    const sunrealtype two_gam = SUN_RCONST(2.0) - sqrt2;
+    /* bet = (4 - sqrt(2)) / 8 */
+    const sunrealtype bet = (SUN_RCONST(4.0) - sqrt2) / SUN_RCONST(8.0);
+
+    ARKodeButcherTable B = ARKodeButcherTable_Alloc(3, SUNTRUE);
+    B->q = 2;
+    B->p = 1;
+
+    B->c[1] = two_gam;
+    B->c[2] = SUN_RCONST(1.0);
+
+    B->A[1][0] = two_gam;
+    B->A[2][0] = SUN_RCONST(1.0) - alp;
+    B->A[2][1] = alp;
+
+    B->b[0] = dlt;
+    B->b[1] = dlt;
+    B->b[2] = gam;
+
+    B->d[0] = bet;
+    B->d[1] = bet;
+    B->d[2] = dlt;
+    return B;
+  })
 
 ARK_BUTCHER_TABLE(ARKODE_BOGACKI_SHAMPINE_4_2_3, { /* Bogacki-Shampine-ERK */
     ARKodeButcherTable B = ARKodeButcherTable_Alloc(4, SUNTRUE);
@@ -684,4 +720,3 @@ ARK_BUTCHER_TABLE(ARKODE_KNOTH_WOLKE_3_3, { /* Knoth-Wolke-ERK */
     B->c[2] = RCONST(3.0)/RCONST(4.0);
     return B;
   })
-
diff --git a/src/cvode/cvode_ls.c b/src/cvode/cvode_ls.c
index a32679ab3c..a9d986c193 100644
--- a/src/cvode/cvode_ls.c
+++ b/src/cvode/cvode_ls.c
@@ -1636,7 +1636,8 @@ int cvLsSolve(CVodeMem cv_mem, N_Vector b, N_Vector weight,
               N_Vector ynow, N_Vector fnow)
 {
   CVLsMem  cvls_mem;
-  realtype bnorm, deltar, delta, w_mean;
+  realtype bnorm = ZERO;
+  realtype deltar, delta, w_mean;
   int      curiter, nli_inc, retval;
 #if SUNDIALS_LOGGING_LEVEL >= SUNDIALS_LOGGING_DEBUG
   realtype resnorm;
diff --git a/src/cvodes/cvodes_ls.c b/src/cvodes/cvodes_ls.c
index b7477bebca..14d12a9bc6 100644
--- a/src/cvodes/cvodes_ls.c
+++ b/src/cvodes/cvodes_ls.c
@@ -1723,7 +1723,8 @@ int cvLsSolve(CVodeMem cv_mem, N_Vector b, N_Vector weight,
               N_Vector ynow, N_Vector fnow)
 {
   CVLsMem  cvls_mem;
-  realtype bnorm, deltar, delta, w_mean;
+  realtype bnorm = ZERO;
+  realtype deltar, delta, w_mean;
   int      curiter, nli_inc, retval;
   booleantype do_sensi_sim, do_sensi_stg, do_sensi_stg1;
 #if SUNDIALS_LOGGING_LEVEL >= SUNDIALS_LOGGING_DEBUG
diff --git a/src/sundials/sundials_futils.c b/src/sundials/sundials_futils.c
index 194a917ae2..7d32c597db 100644
--- a/src/sundials/sundials_futils.c
+++ b/src/sundials/sundials_futils.c
@@ -15,15 +15,37 @@
  * -----------------------------------------------------------------*/
 
 #include <sundials/sundials_futils.h>
+#include <string.h>
 
 /* Create a file pointer with the given file name and mode. */
 FILE* SUNDIALSFileOpen(const char* filename, const char* mode)
 {
-  return fopen(filename, mode);
+  /* A NULL file name cannot be opened; the result is a NULL pointer. */
+  if (filename == NULL)
+  {
+    return NULL;
+  }
+
+  /* The names "stdout" and "stderr" select the standard streams, which
+     are returned as-is without calling fopen and without reading mode. */
+  if (strcmp(filename, "stdout") == 0)
+  {
+    return stdout;
+  }
+  if (strcmp(filename, "stderr") == 0)
+  {
+    return stderr;
+  }
+
+  /* Any other name is handed to fopen together with the given mode. */
+  return fopen(filename, mode);
 }
 
 /* Close a file pointer with the given file name. */
 void SUNDIALSFileClose(FILE* fp)
 {
-  fclose(fp);
+  /* NULL and the standard streams are never passed to fclose. */
+  if ((fp == NULL) || (fp == stdout) || (fp == stderr)) return;
+
+  fclose(fp);
 }
diff --git a/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp b/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp
index d1c7165ed0..72e87f53f9 100644
--- a/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp
+++ b/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp
@@ -180,6 +180,13 @@ SUNLinearSolver SUNLinSol_OneMklDense(N_Vector y, SUNMatrix Amat, SUNContext sun
 
   if (num_blocks > 1)
   {
+#ifdef SUNDIALS_ONEMKL_USE_GETRF_LOOP
+    LS_F_SCRATCH_SIZE(S) =
+      getrf_scratchpad_size<realtype>(*queue, // device queue
+                                      M,      // rows in A_i
+                                      N,      // columns in A_i
+                                      M);     // leading dimension
+#else
     LS_F_SCRATCH_SIZE(S) =
       getrf_batch_scratchpad_size<realtype>(*queue,      // device queue
                                             M,           // rows in A_i
@@ -188,8 +195,17 @@ SUNLinearSolver SUNLinSol_OneMklDense(N_Vector y, SUNMatrix Amat, SUNContext sun
                                             M * N,       // stride between A_i
                                             M,           // stride in P_i
                                             num_blocks); // number of blocks
+#endif
 
-#ifdef SUNDIALS_ONEMKL_USE_GETRS_BATCHED
+#ifdef SUNDIALS_ONEMKL_USE_GETRS_LOOP
+    LS_S_SCRATCH_SIZE(S) =
+      getrs_scratchpad_size<realtype>(*queue,  // device queue
+                                      oneapi::mkl::transpose::nontrans,
+                                      M,      // number of rows in A
+                                      1,      // number of right-hand sides
+                                      M,      // leading dimension of A
+                                      M);     // leading dimension of B
+#else
     LS_S_SCRATCH_SIZE(S)=
       getrs_batch_scratchpad_size<realtype>(*queue,      // device queue
                                             oneapi::mkl::transpose::nontrans,
@@ -201,14 +217,6 @@ SUNLinearSolver SUNLinSol_OneMklDense(N_Vector y, SUNMatrix Amat, SUNContext sun
                                             M,           // leading dimension of B_i
                                             M,           // stride between B_i
                                             num_blocks); // number of blocks
-#else
-    LS_S_SCRATCH_SIZE(S) =
-      getrs_scratchpad_size<realtype>(*queue,  // device queue
-                                      oneapi::mkl::transpose::nontrans,
-                                      M,      // number of rows in A
-                                      1,      // number of right-hand sizes
-                                      M,      // leading dimension of A
-                                      M);     // leading dimension of B
 #endif
   }
   else
@@ -326,6 +334,36 @@ int SUNLinSolSetup_OneMklDense(SUNLinearSolver S, SUNMatrix A)
 
   if (num_blocks > 1)
   {
+#ifdef SUNDIALS_ONEMKL_USE_GETRF_LOOP
+    try
+    {
+      for (sunindextype i = 0; i < num_blocks; i++)
+      {
+        getrf(*queue,            // device queue
+              M,                 // number of rows
+              N,                 // number of columns
+              Adata + i * M * N, // matrix data for block i
+              M,                 // leading dimension of A
+              pivots + i * M,    // pivots for block i
+              scratchpad,        // scratchpad memory (same buffer each pass)
+              scratch_size);     // scratchpad size
+      }
+    }
+    catch(oneapi::mkl::lapack::exception const& e)
+    {
+      SUNDIALS_DEBUG_ERROR("An exception occured in getrf\n");
+      if (e.info()) // NOTE(review): confirm nonzero info() cannot also mean a zero pivot for getrf
+      {
+        // An illegal value was provided or the scratch pad is too small
+        ier = -1;
+      }
+      else
+      {
+        // The diagonal element of some of U_i is zero
+        ier = 1;
+      }
+    }
+#else
     try
     {
       getrf_batch(*queue,         // device queue
@@ -354,6 +392,7 @@ int SUNLinSolSetup_OneMklDense(SUNLinearSolver S, SUNMatrix A)
         ier = 1;
       }
     }
+#endif
   }
   else
   {
@@ -467,7 +506,30 @@ int SUNLinSolSolve_OneMklDense(SUNLinearSolver S, SUNMatrix A, N_Vector x,
 
   if (num_blocks > 1)
   {
-#ifdef SUNDIALS_ONEMKL_USE_GETRS_BATCHED
+#ifdef SUNDIALS_ONEMKL_USE_GETRS_LOOP
+    try
+    {
+      for (sunindextype i = 0; i < num_blocks; i++)
+      {
+        getrs(*queue,            // device queue
+              oneapi::mkl::transpose::nontrans,
+              M,                 // number of rows
+              1,                 // number of right-hand sides
+              Adata + i * M * N, // factorized matrix data for block i
+              M,                 // leading dimension of A
+              pivots + i * M,    // pivots for block i (M entries per block)
+              xdata + i * M,     // right-hand side data for block i
+              M,                 // leading dimension of B_i
+              scratchpad,        // scratchpad memory
+              scratch_size);     // scratchpad size
+      }
+    }
+    catch(oneapi::mkl::lapack::exception const& e)
+    {
+      SUNDIALS_DEBUG_ERROR("An exception occured in getrs\n");
+      ier = -1;
+    }
+#else
     try
     {
       getrs_batch(*queue,        // device queue
@@ -491,29 +553,6 @@ int SUNLinSolSolve_OneMklDense(SUNLinearSolver S, SUNMatrix A, N_Vector x,
       SUNDIALS_DEBUG_ERROR("An exception occured in getrs_batch\n");
       ier = -1;
     }
-#else
-    try
-    {
-      for (sunindextype i = 0; i < num_blocks; i++)
-      {
-        getrs(*queue,            // device queue
-              oneapi::mkl::transpose::nontrans,
-              M,                 // number of rows
-              1,                 // number of right-hand sides
-              Adata + i * M * N, // factorized matrix data
-              M,                 // leading dimension of A
-              pivots,            // array of pivots
-              xdata + i * M,     // right-hand side data
-              M,                 // leading dimension of B_i
-              scratchpad,        // scratchpad memory
-              scratch_size);     // scratchpad size
-      }
-    }
-    catch(oneapi::mkl::lapack::exception const& e)
-    {
-      SUNDIALS_DEBUG_ERROR("An exception occured in getrs\n");
-      ier = -1;
-    }
 #endif
   }
   else
diff --git a/test/answers b/test/answers
index 72fd01e63e..96d6e170c1 160000
--- a/test/answers
+++ b/test/answers
@@ -1 +1 @@
-Subproject commit 72fd01e63edeffe39a800c820ac8aa8447270bf7
+Subproject commit 96d6e170c15f997d1e9062d4e6478e618d3f30ca
diff --git a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp
index d89ecacc39..8ad033603d 100644
--- a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp
+++ b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp
@@ -29,13 +29,13 @@
 int main() {
 
   // set vectors of individual tables to test
-  std::vector<std::string> Tables_ERK = {"ARKODE_HEUN_EULER_2_1_2",
+  std::vector<std::string> Tables_ERK = {"ARKODE_HEUN_EULER_2_1_2", "ARKODE_ARK2_ERK_3_1_2",
     "ARKODE_BOGACKI_SHAMPINE_4_2_3", "ARKODE_ARK324L2SA_ERK_4_2_3", "ARKODE_ZONNEVELD_5_3_4",
     "ARKODE_ARK436L2SA_ERK_6_3_4", "ARKODE_SAYFY_ABURUB_6_3_4", "ARKODE_CASH_KARP_6_4_5",
     "ARKODE_FEHLBERG_6_4_5", "ARKODE_DORMAND_PRINCE_7_4_5", "ARKODE_ARK548L2SA_ERK_8_4_5",
     "ARKODE_VERNER_8_5_6", "ARKODE_FEHLBERG_13_7_8", "ARKODE_ARK437L2SA_ERK_7_3_4",
     "ARKODE_ARK548L2SAb_ERK_8_4_5"};
-  std::vector<std::string> Tables_DIRK = {"ARKODE_SDIRK_2_1_2",
+  std::vector<std::string> Tables_DIRK = {"ARKODE_SDIRK_2_1_2", "ARKODE_ARK2_DIRK_3_1_2",
     "ARKODE_BILLINGTON_3_3_2", "ARKODE_TRBDF2_3_3_2", "ARKODE_KVAERNO_4_2_3",
     "ARKODE_ARK324L2SA_DIRK_4_2_3", "ARKODE_CASH_5_2_4", "ARKODE_CASH_5_3_4",
     "ARKODE_SDIRK_5_3_4", "ARKODE_KVAERNO_5_3_4", "ARKODE_ARK436L2SA_DIRK_6_3_4",
@@ -44,15 +44,15 @@ int main() {
     "ARKODE_ESDIRK324L2SA_4_2_3", "ARKODE_ESDIRK325L2SA_5_2_3", "ARKODE_ESDIRK32I5L2SA_5_2_3",
     "ARKODE_ESDIRK436L2SA_6_3_4", "ARKODE_ESDIRK43I6L2SA_6_3_4", "ARKODE_QESDIRK436L2SA_6_3_4",
     "ARKODE_ESDIRK437L2SA_7_3_4", "ARKODE_ESDIRK547L2SA_7_4_5", "ARKODE_ESDIRK547L2SA2_7_4_5"};
-  std::vector<ARKODE_ERKTableID> Tables_ARK_ERK = {ARKODE_ARK324L2SA_ERK_4_2_3,
+  std::vector<ARKODE_ERKTableID> Tables_ARK_ERK = {ARKODE_ARK2_ERK_3_1_2, ARKODE_ARK324L2SA_ERK_4_2_3,
     ARKODE_ARK436L2SA_ERK_6_3_4, ARKODE_ARK437L2SA_ERK_7_3_4, ARKODE_ARK548L2SA_ERK_8_4_5,
     ARKODE_ARK548L2SAb_ERK_8_4_5};
-  std::vector<ARKODE_DIRKTableID> Tables_ARK_DIRK = {ARKODE_ARK324L2SA_DIRK_4_2_3,
+  std::vector<ARKODE_DIRKTableID> Tables_ARK_DIRK = {ARKODE_ARK2_DIRK_3_1_2, ARKODE_ARK324L2SA_DIRK_4_2_3,
     ARKODE_ARK436L2SA_DIRK_6_3_4, ARKODE_ARK437L2SA_DIRK_7_3_4, ARKODE_ARK548L2SA_DIRK_8_4_5,
     ARKODE_ARK548L2SAb_DIRK_8_4_5};
-  std::vector<std::string> STables_ARK = {"ARKODE_ARK324L2SA_4_2_3", "ARKODE_ARK436L2SA_6_3_4",
-                                          "ARKODE_ARK437L2SA_7_3_4", "ARKODE_ARK548L2SA_8_4_5",
-                                          "ARKODE_ARK548L2SAb_8_4_5"};
+  std::vector<std::string> STables_ARK = {"ARKODE_ARK2_3_1_2", "ARKODE_ARK324L2SA_4_2_3",
+                                          "ARKODE_ARK436L2SA_6_3_4", "ARKODE_ARK437L2SA_7_3_4",
+                                          "ARKODE_ARK548L2SA_8_4_5", "ARKODE_ARK548L2SAb_8_4_5"};
   int numfails = 0;
 
   // loop over individual ERK tables
diff --git a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out
index 054defea65..8d7971338b 100644
--- a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out
+++ b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out
@@ -2,6 +2,7 @@
 Testing individual ERK methods:
 
 Testing method ARKODE_HEUN_EULER_2_1_2:  table matches predicted method/embedding orders of 2/1
+Testing method ARKODE_ARK2_ERK_3_1_2:  table matches predicted method/embedding orders of 2/1
 Testing method ARKODE_BOGACKI_SHAMPINE_4_2_3:  table matches predicted method/embedding orders of 3/2
 Testing method ARKODE_ARK324L2SA_ERK_4_2_3:  table matches predicted method/embedding orders of 3/2
 Testing method ARKODE_ZONNEVELD_5_3_4:  table matches predicted method/embedding orders of 4/3
@@ -25,6 +26,7 @@ Testing method ARKODE_ARK548L2SAb_ERK_8_4_5:  table matches predicted method/emb
 Testing individual DIRK methods:
 
 Testing method ARKODE_SDIRK_2_1_2:  table matches predicted method/embedding orders of 2/1
+Testing method ARKODE_ARK2_DIRK_3_1_2:  table matches predicted method/embedding orders of 2/1
 Testing method ARKODE_BILLINGTON_3_3_2:  table matches predicted method/embedding orders of 2/3
 Testing method ARKODE_TRBDF2_3_3_2:  table matches predicted method/embedding orders of 2/3
 Testing method ARKODE_KVAERNO_4_2_3:  table matches predicted method/embedding orders of 3/2
@@ -50,6 +52,7 @@ Testing method ARKODE_ESDIRK547L2SA2_7_4_5:  table matches predicted method/embe
 
 Testing ARK pairs:
 
+Testing method ARKODE_ARK2_3_1_2:  Method/embedding match predicted orders of 2/1
 Testing method ARKODE_ARK324L2SA_4_2_3:  Method/embedding match predicted orders of 3/2
 Testing method ARKODE_ARK436L2SA_6_3_4:  Method/embedding match predicted orders of 4/3
 Testing method ARKODE_ARK437L2SA_7_3_4:  Method/embedding match predicted orders of 4/3
diff --git a/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp b/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp
index 325d8654d1..52e1dab5dd 100644
--- a/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp
+++ b/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp
@@ -20,7 +20,7 @@ int test_instance(SUNMemoryHelper helper, SUNMemoryType mem_type,
                   bool print_test_status)
 {
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else