diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44eb259cbf..6dbf013aef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,22 @@
 # SUNDIALS Changelog
 
+## Changes to SUNDIALS in release 6.6.0
+
+Added the second order IMEX method from Giraldo, Kelly, and Constantinescu 2013
+as the default second order IMEX method in ARKStep. The explicit table is given
+by `ARKODE_ARK2_ERK_3_1_2` and the implicit table by `ARKODE_ARK2_DIRK_3_1_2`.
+
+Updated the F2003 utility routines `SUNDIALSFileOpen` and `SUNDIALSFileClose`
+to support user specification of `stdout` and `stderr` strings for the output
+file names.
+
+Updated the default behavior of CVODE, CVODES, and ARKODE for returning the solution
+once the internal time has reached a user-specified stop time.  Previously, the output
+solution was interpolated to the value of `tstop`; the default is now to copy the
+internal solution vector.  Users who wish to revert to interpolation may call the new
+routines `CVodeSetInterpolateStopTime`, `ARKStepSetInterpolateStopTime`,
+`ERKStepSetInterpolateStopTime`, or `MRIStepSetInterpolateStopTime`.
+
 ## Changes to SUNDIALS in release 6.5.1
 
 Added the functions `ARKStepClearStopTime`, `ERKStepClearStopTime`,
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 793ffbb0a4..e9fd4648c1 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -26,12 +26,9 @@ sundials_option(BENCHMARK_NVECTOR BOOL "NVector benchmarks are on" ON)
 # Add specific benchmarks
 #----------------------------------------
 
-if(ENABLE_MPI AND ENABLE_RAJA)
-  add_subdirectory(advection_reaction_3D)
-endif()
-
 if(ENABLE_MPI)
   add_subdirectory(diffusion_2D)
+  add_subdirectory(advection_reaction_3D)
 endif()
 
 # Add the nvector benchmarks
diff --git a/benchmarks/advection_reaction_3D/CMakeLists.txt b/benchmarks/advection_reaction_3D/CMakeLists.txt
index e51a95155a..7469a6a10a 100644
--- a/benchmarks/advection_reaction_3D/CMakeLists.txt
+++ b/benchmarks/advection_reaction_3D/CMakeLists.txt
@@ -1,5 +1,5 @@
 # ---------------------------------------------------------------
-# Programmer(s):  Cody J. Balos @ LLNL
+# Programmer(s): Daniel R. Reynolds @ SMU
 # ---------------------------------------------------------------
 # SUNDIALS Copyright Start
 # Copyright (c) 2002-2023, Lawrence Livermore National Security
@@ -12,135 +12,10 @@
 # SUNDIALS Copyright End
 # ---------------------------------------------------------------
 
-if(BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA)
-
-  if((RAJA_BACKENDS MATCHES "TARGET_OPENMP") OR (RAJA_BACKENDS MATCHES "OPENMP"))
-    set(OTHER_LIBS OpenMP::OpenMP_CXX)
-  endif()
-
-  # ----------------------------------------------------------------------------
-  # MPI only
-  # ----------------------------------------------------------------------------
-
-  add_executable(advection_reaction_3D
-    advection_reaction_3D.cpp
-    arkode_driver.cpp
-    cvode_driver.cpp
-    ida_driver.cpp
-    rhs3D.hpp
-    ParallelGrid.hpp
-    backends.hpp)
-
-  # ensure the linker language is reset to CXX
-  set_target_properties(advection_reaction_3D PROPERTIES LINKER_LANGUAGE CXX)
-
-  target_include_directories(advection_reaction_3D
-    PRIVATE
-    ${PROJECT_SOURCE_DIR}/utilities
-    ${MPI_CXX_INCLUDE_DIRS})
-
-  target_link_libraries(advection_reaction_3D
-    PRIVATE
-    sundials_arkode
-    sundials_cvode
-    sundials_ida
-    sundials_nvecmpiplusx
-    sundials_nvecserial
-    RAJA
-    ${MPI_CXX_LIBRARIES}
-    ${OTHER_LIBS})
-
-  install(TARGETS advection_reaction_3D
-    DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D")
-
-  install(FILES README.md
-    DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D")
-
-  # ----------------------------------------------------------------------------
-  # MPI + CUDA
-  # ----------------------------------------------------------------------------
-
-  if(BUILD_NVECTOR_CUDA)
-
-    set_source_files_properties(advection_reaction_3D.cpp
-      PROPERTIES LANGUAGE CUDA)
-    set_source_files_properties(arkode_driver.cpp PROPERTIES LANGUAGE CUDA)
-    set_source_files_properties(cvode_driver.cpp PROPERTIES LANGUAGE CUDA)
-    set_source_files_properties(ida_driver.cpp PROPERTIES LANGUAGE CUDA)
-
-    add_executable(advection_reaction_3D_mpicuda
-      advection_reaction_3D.cpp
-      arkode_driver.cpp
-      cvode_driver.cpp
-      ida_driver.cpp
-      rhs3D.hpp
-      ParallelGrid.hpp
-      backends.hpp)
-
-    # ensure the linker language is reset to CXX
-    set_target_properties(advection_reaction_3D_mpicuda
-      PROPERTIES LINKER_LANGUAGE CXX)
-
-    target_include_directories(advection_reaction_3D_mpicuda
-      PRIVATE
-      ${PROJECT_SOURCE_DIR}/utilities
-      ${MPI_CXX_INCLUDE_DIRS})
-
-    target_link_libraries(advection_reaction_3D_mpicuda
-      PRIVATE
-      sundials_arkode
-      sundials_cvode
-      sundials_ida
-      sundials_nvecmpiplusx
-      sundials_nveccuda
-      RAJA
-      ${MPI_CXX_LIBRARIES}
-      ${OTHER_LIBS})
-
-    target_compile_definitions(advection_reaction_3D_mpicuda PRIVATE USE_CUDA_NVEC)
-
-    install(TARGETS advection_reaction_3D_mpicuda
-      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D")
-
-  endif()
-
-  # ----------------------------------------------------------------------------
-  # MPI + HIP
-  # ----------------------------------------------------------------------------
-
-  if(BUILD_NVECTOR_HIP)
-
-    add_executable(advection_reaction_3D_mpihip
-      advection_reaction_3D.cpp
-      arkode_driver.cpp
-      cvode_driver.cpp
-      ida_driver.cpp
-      rhs3D.hpp
-      ParallelGrid.hpp
-      backends.hpp)
-
-    target_include_directories(advection_reaction_3D_mpihip
-      PRIVATE
-      ${PROJECT_SOURCE_DIR}/utilities
-      ${MPI_CXX_INCLUDE_DIRS})
-
-    target_link_libraries(advection_reaction_3D_mpihip
-      PRIVATE
-      sundials_arkode
-      sundials_cvode
-      sundials_ida
-      sundials_nvecmpiplusx
-      sundials_nvechip
-      RAJA
-      hip::device
-      ${MPI_CXX_LIBRARIES}
-      ${OTHER_LIBS})
-
-    target_compile_definitions(advection_reaction_3D_mpihip PRIVATE USE_HIP_NVEC)
-
-    install(TARGETS advection_reaction_3D_mpihip
-      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D")
-
-  endif()
+if(ENABLE_RAJA)
+  add_subdirectory(raja)  # built only when RAJA support is enabled
+endif()
 
+if(ENABLE_KOKKOS AND BUILD_NVECTOR_KOKKOS)
+  add_subdirectory(kokkos)  # needs both Kokkos support and the Kokkos NVector
 endif()
diff --git a/benchmarks/advection_reaction_3D/kokkos/CMakeLists.txt b/benchmarks/advection_reaction_3D/kokkos/CMakeLists.txt
new file mode 100644
index 0000000000..2d58e5fe4c
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/CMakeLists.txt
@@ -0,0 +1,61 @@
+# ---------------------------------------------------------------
+# Programmer(s):  Daniel R. Reynolds @ SMU
+# ---------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2023, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ---------------------------------------------------------------
+
+# Add the build targets for each backend
+if(BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA)
+  foreach(backend ${KOKKOS_EXAMPLES_BACKENDS})
+
+    # set benchmark target name
+    set(benchmark_target "advection_reaction_3D_kokkos.${backend}")
+
+    # benchmark source files
+    add_executable(${benchmark_target}
+      advection_reaction_3D.cpp
+      arkode_driver.cpp
+      cvode_driver.cpp
+      ida_driver.cpp
+      rhs3D.hpp
+      ParallelGrid.hpp
+      check_retval.h)
+
+    # which backend to use (the sources select on the USE_<backend> macro)
+    target_compile_definitions(${benchmark_target} PRIVATE USE_${backend})
+
+    # directories to include
+    target_include_directories(${benchmark_target}
+      PRIVATE
+      ${PROJECT_SOURCE_DIR}/utilities
+      ${MPI_CXX_INCLUDE_DIRS}
+    )
+
+    # libraries to link against
+    target_link_libraries(${benchmark_target}
+      PRIVATE
+      sundials_arkode
+      sundials_cvode
+      sundials_ida
+      sundials_nvecmpiplusx
+      sundials_nveckokkos
+      ${MPI_CXX_LIBRARIES}
+      ${EXE_EXTRA_LINK_LIBS}
+    )
+
+    install(TARGETS ${benchmark_target}
+      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/kokkos")
+  endforeach()
+
+  # backend-independent files: install once instead of once per backend
+  install(FILES README.md ../scripts/compare_error.py ../scripts/compute_error.py ../scripts/pickle_solution_output.py
+    DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/kokkos")
+endif()
diff --git a/benchmarks/advection_reaction_3D/kokkos/ParallelGrid.hpp b/benchmarks/advection_reaction_3D/kokkos/ParallelGrid.hpp
new file mode 100644
index 0000000000..c324105b02
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/ParallelGrid.hpp
@@ -0,0 +1,593 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * A simple implementation of a parallel structured Cartesian mesh class that
+ * supports up to 3 spatial dimensions and an arbitrary number of degrees of
+ * freedom, and that uses Kokkos views to store communication buffer data.
+ * ----------------------------------------------------------------------------*/
+
+#ifndef _KOKKOSPARGRID_H
+#define _KOKKOSPARGRID_H
+
+#include <cassert>
+#include <cstdio>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <mpi.h>
+#include <Kokkos_Core.hpp>
+#include <sundials/sundials_types.h>
+/* Set Kokkos execution space and type shortcuts (selected by the USE_* macro) */
+#if defined(USE_CUDA)
+using ExecSpace = Kokkos::Cuda;
+using MemSpace  = Kokkos::CudaSpace;
+#elif defined(USE_HIP)
+#if KOKKOS_VERSION / 10000 > 3  // quotient above 3 (presumably Kokkos major version 4+)
+using ExecSpace = Kokkos::HIP;
+using MemSpace  = Kokkos::HIPSpace;
+#else
+using ExecSpace = Kokkos::Experimental::HIP;
+using MemSpace  = Kokkos::Experimental::HIPSpace;
+#endif
+#elif defined(USE_OPENMP)
+using ExecSpace = Kokkos::OpenMP;
+using MemSpace  = Kokkos::HostSpace;
+#else  // none of USE_CUDA / USE_HIP / USE_OPENMP is defined: serial host execution
+using ExecSpace = Kokkos::Serial;
+using MemSpace  = Kokkos::HostSpace;
+#endif
+using Vec1D = Kokkos::View<realtype*, MemSpace>;     // rank-1 view in MemSpace
+using Vec4D = Kokkos::View<realtype****, MemSpace>;  // rank-4 view in MemSpace
+using Vec1DHost = Vec1D::HostMirror;                 // host mirror type of Vec1D
+using Vec4DHost = Vec4D::HostMirror;                 // host mirror type of Vec4D
+using Range3D = Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>>;  // rank-3 range policy on ExecSpace
+
+
+namespace sundials_tools
+{
+
+// Types of boundaries supported.
+enum class BoundaryType
+{
+  PERIODIC  // ParallelGrid marks all three Cartesian dimensions periodic for this value
+};
+
+// Types of stencils supported.
+enum class StencilType
+{
+  UPWIND  // the only value the ParallelGrid constructor accepts (it asserts on it)
+};
+
+template<typename GLOBALINT>
+class ParallelGrid
+{
+public:
+  // Constructor that creates a new ParallelGrid object.
+  // [in,out] comm - on input, the overall MPI communicator, on output, the cartesian communicator
+  // [in] a[] - an array of length 3 which defines the lower corner of the domain [a,b]
+  // [in] b[] - an array of length 3 which defines the upper corner of the domain [a,b]
+  // [in] npts[] - an array of length 3 which defines the number of mesh points in each dimension
+  // [in] dof - the number of degrees of freedom per mesh point
+  // [in] bc - the type of boundary conditions (see BoundaryType)
+  // [in] st - the stencil to use (see StencilType)
+  // [in] c - only its sign is used: c > 0 selects upwinding to the right (presumably the advection speed)
+  // [in] npxyz - the number of processors in each dimension; defaults to nullptr (all zeros) which means MPI will choose
+  // [in] reorder - should MPI_Cart_create do process reordering to optimize or not; defaults to false (some MPI implementations ignore this)
+  ParallelGrid(MPI_Comm* comm, const realtype a[], const realtype b[], const GLOBALINT npts[],
+               int dof, BoundaryType bc, StencilType st, const realtype c,
+               const int npxyz[] = nullptr, bool reorder = false)
+    : nx(1), ny(1), nz(1),
+      nxl(1), nyl(1), nzl(1),
+      npx(1), npy(1), npz(1),
+      dx(0.0), dy(0.0), dz(0.0),
+      ax(0.0), ay(0.0), az(0.0),
+      bx(0.0), by(0.0), bz(0.0),
+      dof(dof), neq(0), upwindRight(true), dims{0,0,0}, coords{0,0,0},
+      cart_comm(MPI_COMM_NULL), nreq(0), bc(bc), st(st)
+  {
+    assert(st == StencilType::UPWIND);
+
+    /* Set up MPI Cartesian communicator */
+    if (npxyz)
+    {
+      dims[0] = npxyz[0];
+      dims[1] = npxyz[1];
+      dims[2] = npxyz[2];
+    }
+
+    int retval, nprocs;
+    MPI_Comm_size(*comm, &nprocs);
+    retval = MPI_Dims_create(nprocs, 3, dims);  // fills in the entries of dims that are still 0
+    assert(retval == MPI_SUCCESS);
+
+    int periods[] = { bc == BoundaryType::PERIODIC,
+                      bc == BoundaryType::PERIODIC,
+                      bc == BoundaryType::PERIODIC };
+    retval = MPI_Cart_create(*comm, 3, dims, periods, reorder, comm);  // overwrites the caller's handle
+    assert(retval == MPI_SUCCESS);
+
+    retval = MPI_Cart_get(*comm, 3, dims, periods, coords);
+    assert(retval == MPI_SUCCESS);
+
+    cart_comm = *comm;
+
+    /* Set upwinding direction */
+    upwindRight = (c > 0.0);
+
+    /* Set up information for the first spatial dimension */
+    npx = dims[0];
+    nx  = npts[0];
+    ax  = a[0];
+    bx  = b[0];
+    dx  = (bx-ax) / (realtype) nx;
+    int is = nx*(coords[0])/npx;      // first global x index owned by this process
+    int ie = nx*(coords[0]+1)/npx-1;  // last global x index owned by this process
+    nxl = ie-is+1;
+    neq = dof * nxl;
+
+    /* Set up information for the second spatial dimension */
+    npy = dims[1];
+    ny  = npts[1];
+    ay  = a[1];
+    by  = b[1];
+    dy  = (by-ay) / (realtype) ny;
+    int js = ny*(coords[1])/npy;      // first global y index owned by this process
+    int je = ny*(coords[1]+1)/npy-1;  // last global y index owned by this process
+    nyl = je-js+1;
+    neq *= nyl;
+
+    /* Set up information for the third spatial dimension */
+    npz = dims[2];
+    nz  = npts[2];
+    az  = a[2];
+    bz  = b[2];
+    dz  = (bz-az) / (realtype) nz;
+    int ks = nz*(coords[2])/npz;      // first global z index owned by this process
+    int ke = nz*(coords[2]+1)/npz-1;  // last global z index owned by this process
+    nzl = ke-ks+1;
+    neq *= nzl;
+
+    /* Allocate buffers for nearest-neighbor exchange */
+    if (st == StencilType::UPWIND)
+      AllocateBuffersUpwind();
+
+  }
+
+  // Allocates the exchange buffers (and their host mirrors) that the current
+  // upwind direction needs: recv on W/S/B and send on E/N/F when upwindRight,
+  // the opposite set otherwise.  Also looks up the neighbor rank for every
+  // face that has one.  TODO: support non-periodic boundary conditions.
+  void AllocateBuffersUpwind()
+  {
+
+    /* Allocate send/receive buffers and determine ID for communication West */
+    if (upwindRight) {
+      Wrecv_  = Vec1D("Wrecv", dof*nyl*nzl);
+      WrecvH_ = Kokkos::create_mirror_view(Wrecv_);
+    } else {
+      Wsend_  = Vec1D("Wsend", dof*nyl*nzl);
+      WsendH_ = Kokkos::create_mirror_view(Wsend_);
+    }
+    ipW = MPI_PROC_NULL;
+    if ((coords[0] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0]-1, coords[1], coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipW);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication East */
+    if (upwindRight) {
+      Esend_  = Vec1D("Esend", dof*nyl*nzl);
+      EsendH_ = Kokkos::create_mirror_view(Esend_);
+    } else {
+      Erecv_  = Vec1D("Erecv", dof*nyl*nzl);
+      ErecvH_ = Kokkos::create_mirror_view(Erecv_);
+    }
+    ipE = MPI_PROC_NULL;
+    if ((coords[0] < dims[0]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0]+1, coords[1], coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipE);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication South */
+    if (upwindRight) {
+      Srecv_  = Vec1D("Srecv", dof*nxl*nzl);
+      SrecvH_ = Kokkos::create_mirror_view(Srecv_);
+    } else {
+      Ssend_  = Vec1D("Ssend", dof*nxl*nzl);
+      SsendH_ = Kokkos::create_mirror_view(Ssend_);
+    }
+    ipS = MPI_PROC_NULL;
+    if ((coords[1] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1]-1, coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipS);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication North */
+    if (upwindRight) {
+      Nsend_  = Vec1D("Nsend", dof*nxl*nzl);
+      NsendH_ = Kokkos::create_mirror_view(Nsend_);
+    } else {
+      Nrecv_  = Vec1D("Nrecv", dof*nxl*nzl);
+      NrecvH_ = Kokkos::create_mirror_view(Nrecv_);
+    }
+    ipN = MPI_PROC_NULL;
+    if ((coords[1] < dims[1]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1]+1, coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipN);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication Back */
+    if (upwindRight) {
+      Brecv_  = Vec1D("Brecv", dof*nxl*nyl);
+      BrecvH_ = Kokkos::create_mirror_view(Brecv_);
+    } else {
+      Bsend_  = Vec1D("Bsend", dof*nxl*nyl);
+      BsendH_ = Kokkos::create_mirror_view(Bsend_);
+    }
+    ipB = MPI_PROC_NULL;
+    if ((coords[2] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1], coords[2]-1};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipB);
+      assert(retval == MPI_SUCCESS);
+    }
+
+    /* Allocate send/receive buffers and determine ID for communication Front */
+    if (upwindRight) {
+      Fsend_  = Vec1D("Fsend", dof*nxl*nyl);
+      FsendH_ = Kokkos::create_mirror_view(Fsend_);
+    } else {
+      Frecv_  = Vec1D("Frecv", dof*nxl*nyl);
+      FrecvH_ = Kokkos::create_mirror_view(Frecv_);
+    }
+    ipF = MPI_PROC_NULL;
+    if ((coords[2] < dims[2]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1], coords[2]+1};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipF);
+      assert(retval == MPI_SUCCESS);
+    }
+
+  }
+
+  // Initiate non-blocking neighbor communication; returns the code of the last MPI call made (0 if none)
+  int ExchangeStart()
+  {
+    int retval = 0;
+    nreq = 0;
+
+    // Reset every request slot; nreq counts the slots actually posted below
+    for (int i=0; i<12; i++)
+      req[i] = MPI_REQUEST_NULL;
+
+    // Post a host-side Irecv on each face this upwind direction receives from
+    if ((ipW != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Irecv(WrecvH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipW,
+                         1, cart_comm, req+nreq);  // tag 1 pairs with the East send below
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipE != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Irecv(ErecvH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipE,
+                         0, cart_comm, req+nreq);  // tag 0 pairs with the West send below
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipS != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Irecv(SrecvH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipS,
+                         3, cart_comm, req+nreq);  // tag 3 pairs with the North send below
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipN != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Irecv(NrecvH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipN,
+                         2, cart_comm, req+nreq);  // tag 2 pairs with the South send below
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipB != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Irecv(BrecvH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipB,
+                         5, cart_comm, req+nreq);  // tag 5 pairs with the Front send below
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipF != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Irecv(FrecvH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipF,
+                         4, cart_comm, req+nreq);  // tag 4 pairs with the Back send below
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    // Send data to neighbors, first copying from device to host buffers
+    if ((ipW != MPI_PROC_NULL) && (!upwindRight))
+    {
+      Kokkos::deep_copy(WsendH_, Wsend_);
+      retval = MPI_Isend(WsendH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, 0,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipE != MPI_PROC_NULL) && (upwindRight))
+    {
+      Kokkos::deep_copy(EsendH_, Esend_);
+      retval = MPI_Isend(EsendH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, 1,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipS != MPI_PROC_NULL) && (!upwindRight))
+    {
+      Kokkos::deep_copy(SsendH_, Ssend_);
+      retval = MPI_Isend(SsendH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, 2,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipN != MPI_PROC_NULL) && (upwindRight))
+    {
+      Kokkos::deep_copy(NsendH_, Nsend_);
+      retval = MPI_Isend(NsendH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, 3,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipB != MPI_PROC_NULL) && (!upwindRight))
+    {
+      Kokkos::deep_copy(BsendH_, Bsend_);
+      retval = MPI_Isend(BsendH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, 4,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipF != MPI_PROC_NULL) && (upwindRight))
+    {
+      Kokkos::deep_copy(FsendH_, Fsend_);
+      retval = MPI_Isend(FsendH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, 5,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    return retval;
+  }
+
+  // Waits for the neighbor exchange posted by ExchangeStart to finish, then
+  // copies every received host buffer into its device buffer.  Returns 0 when
+  // nothing was outstanding, otherwise the MPI_Waitall return code.
+  int ExchangeEnd()
+  {
+    // return automatically with success if there are no outstanding requests
+    if (nreq == 0)
+      return 0;
+
+    // Wait for messages to finish send/receive; the statuses are never read
+    const int retval = MPI_Waitall(nreq, req, MPI_STATUSES_IGNORE);
+    assert(retval == MPI_SUCCESS);
+    nreq = 0;  // everything completed, so a repeated call becomes a no-op
+
+    // Copy data from host to device buffers
+    if ((ipW != MPI_PROC_NULL) && (upwindRight))
+      Kokkos::deep_copy(Wrecv_, WrecvH_);
+    if ((ipE != MPI_PROC_NULL) && (!upwindRight))
+      Kokkos::deep_copy(Erecv_, ErecvH_);
+    if ((ipS != MPI_PROC_NULL) && (upwindRight))
+      Kokkos::deep_copy(Srecv_, SrecvH_);
+    if ((ipN != MPI_PROC_NULL) && (!upwindRight))
+      Kokkos::deep_copy(Nrecv_, NrecvH_);
+    if ((ipB != MPI_PROC_NULL) && (upwindRight))
+      Kokkos::deep_copy(Brecv_, BrecvH_);
+    if ((ipF != MPI_PROC_NULL) && (!upwindRight))
+      Kokkos::deep_copy(Frecv_, FrecvH_);
+
+    return retval;
+  }
+
+  // Prints out information about the ParallelGrid to stdout (realtype values are cast to double to match %g).
+  void PrintInfo()
+  {
+    printf("ParallelGrid Info:\n");
+    printf("    dimensions = %d\n", 3);
+    printf("    processors = {%d, %d, %d}\n", npx, npy, npz);
+    printf("        domain = {[%g,%g], [%g,%g], [%g,%g]}\n", (double) ax, (double) bx, (double) ay, (double) by, (double) az, (double) bz);
+    printf("   global npts = {%li, %li, %li}\n", (long int) nx, (long int) ny, (long int) nz);
+    printf("    local npts = {%d, %d, %d}\n", nxl, nyl, nzl);
+    printf("  mesh spacing = {%g, %g, %g}\n", (double) dx, (double) dy, (double) dz);
+    if (upwindRight)
+      printf("    upwind dir = right\n");
+    else
+      printf("    upwind dir = left\n");
+  }
+
+  // Saves the global mesh coordinates to a text file.
+  //    First row is x. Second row is y. Third row is z (each starts at the domain origin a).
+  //    Can be loaded into MATLAB like so:
+  //      mesh = load('mesh.txt');
+  //      [X,Y,Z] = meshgrid(mesh(1,:),mesh(2,:),mesh(3,:));
+  void MeshToFile(const std::string& fname)
+  {
+    std::ofstream mesh_file;
+    mesh_file.open(fname);
+    mesh_file << std::setprecision(16);
+    for (GLOBALINT i = 0; i < nx; i++)
+      mesh_file << " " << ax + dx*i;
+    mesh_file << std::endl;
+    for (GLOBALINT i = 0; i < ny; i++)
+      mesh_file << " " << ay + dy*i;
+    mesh_file << std::endl;
+    for (GLOBALINT i = 0; i < nz; i++)
+      mesh_file << " " << az + dz*i;
+    mesh_file << std::endl;
+    mesh_file.close();
+  }
+
+  int nprocs() const  // number of processes in the 3D process grid
+  {
+    return npx*npy*npz;
+  }
+
+  GLOBALINT npts() const  // number of mesh points in the global grid
+  {
+    return nx*ny*nz;
+  }
+
+  GLOBALINT nptsl() const  // number of mesh points owned by this process
+  {
+    return static_cast<GLOBALINT>(nxl)*nyl*nzl;  // widen first so the product cannot overflow int
+  }
+
+  GLOBALINT neql() const  // number of unknowns owned by this process (dof per point)
+  {
+    return dof*nptsl();
+  }
+
+  // Returns the raw data pointer of the receive buffer for one face.
+  // The buffer is a Vec1D, so the pointer refers to memory in MemSpace.
+  //   direction - "WEST", "EAST", "NORTH", "SOUTH", "FRONT", or "BACK"
+  // Any other name (except the literal "ILLEGAL") trips the assert below;
+  // when asserts are compiled out the result is nullptr.
+  // AllocateBuffersUpwind allocates only the receive buffers needed for the
+  // current upwind direction (W/S/B when upwindRight, E/N/F otherwise); the
+  // other views are never assigned.
+  realtype* GetRecvView(const std::string& direction)
+  {
+    // faces normal to the first (x) dimension
+    if (direction == "WEST")
+      return Wrecv_.data();
+    if (direction == "EAST")
+      return Erecv_.data();
+
+    // faces normal to the second (y) dimension
+    if (direction == "NORTH")
+      return Nrecv_.data();
+    if (direction == "SOUTH")
+      return Srecv_.data();
+
+    // faces normal to the third (z) dimension
+    if (direction == "FRONT")
+      return Frecv_.data();
+    if (direction == "BACK")
+      return Brecv_.data();
+
+    // not a recognized face name
+    assert(direction == "ILLEGAL");
+    return nullptr;
+  }
+
+  // Returns the raw data pointer of the send buffer for one face.
+  // The buffer is a Vec1D, so the pointer refers to memory in MemSpace.
+  //   direction - "WEST", "EAST", "NORTH", "SOUTH", "FRONT", or "BACK"
+  // Any other name (except the literal "ILLEGAL") trips the assert below;
+  // when asserts are compiled out the result is nullptr.
+  // AllocateBuffersUpwind allocates only the send buffers needed for the
+  // current upwind direction (E/N/F when upwindRight, W/S/B otherwise); the
+  // other views are never assigned.
+  realtype* GetSendView(const std::string& direction)
+  {
+    // faces normal to the first (x) dimension
+    if (direction == "WEST")
+      return Wsend_.data();
+    if (direction == "EAST")
+      return Esend_.data();
+
+    // faces normal to the second (y) dimension
+    if (direction == "NORTH")
+      return Nsend_.data();
+    if (direction == "SOUTH")
+      return Ssend_.data();
+
+    // faces normal to the third (z) dimension
+    if (direction == "FRONT")
+      return Fsend_.data();
+    if (direction == "BACK")
+      return Bsend_.data();
+
+    // not a recognized face name
+    assert(direction == "ILLEGAL");
+    return nullptr;
+  }
+
+  GLOBALINT nx, ny, nz;    /* number of intervals globally       */
+  int       nxl, nyl, nzl; /* number of intervals locally        */
+  int       npx, npy, npz; /* number of processes per dimension  */
+  realtype  dx, dy, dz;    /* mesh spacing                       */
+  realtype  ax, ay, az;    /* lower corner of the domain [a, b]  */
+  realtype  bx, by, bz;    /* upper corner of the domain [a, b]  */
+  int       dof;           /* degrees of freedom per node        */
+  int       neq;           /* total number of equations locally  */
+
+  int       ipW, ipE;      /* MPI ranks for neighbor procs       */
+  int       ipS, ipN;      /* (MPI_PROC_NULL when no neighbor)   */
+  int       ipB, ipF;
+  bool      upwindRight;   /* Upwind dir: true/false == R/L      */
+
+  int       dims[3];       /* process grid size per dimension    */
+  int       coords[3];     /* this process' Cartesian coords     */
+
+
+private:
+  MPI_Comm     cart_comm;  /* MPI cartesian communicator         */
+  MPI_Request  req[12];    /* requests posted by ExchangeStart   */
+  int          nreq;       /* number of entries of req in use    */
+
+  BoundaryType bc;         /* boundary condition type            */
+  StencilType  st;         /* stencil type                       */
+  
+  Vec1D Wsend_;            /* MPI send/recv buffers              */
+  Vec1D Esend_;
+  Vec1D Ssend_;
+  Vec1D Nsend_;
+  Vec1D Bsend_;
+  Vec1D Fsend_;
+  Vec1D Wrecv_;
+  Vec1D Erecv_;
+  Vec1D Srecv_;
+  Vec1D Nrecv_;
+  Vec1D Brecv_;
+  Vec1D Frecv_;
+  Vec1DHost WsendH_;       /* MPI send/recv buffers (host)       */
+  Vec1DHost EsendH_;
+  Vec1DHost SsendH_;
+  Vec1DHost NsendH_;
+  Vec1DHost BsendH_;
+  Vec1DHost FsendH_;
+  Vec1DHost WrecvH_;
+  Vec1DHost ErecvH_;
+  Vec1DHost SrecvH_;
+  Vec1DHost NrecvH_;
+  Vec1DHost BrecvH_;
+  Vec1DHost FrecvH_;
+
+};
+
+}
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/kokkos/README.md b/benchmarks/advection_reaction_3D/kokkos/README.md
new file mode 100644
index 0000000000..f27484385f
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/README.md
@@ -0,0 +1,113 @@
+# Benchmark: 3D Advection-Reaction
+
+This benchmark problem implements a 3D advection-reaction equation using the
+Kokkos performance portability layer with serial, OpenMP, CUDA, or HIP backends.
+
+## Problem description
+
+This code simulates the advection and reaction of three chemical species where
+the reaction mechanism is a variation of the Brusselator problem from chemical
+kinetics. The PDE system is given by
+```math
+\begin{align}
+  u_t &= -c \nabla u + A - (w+1) u + v u^2 \\
+  v_t &= -c \nabla v + w u - v u^2 \\
+  w_t &= -c \nabla w + (B - w) / \epsilon - w u
+\end{align}
+```
+where $u$, $v$, and $w$ are chemical concentrations, $c$ is the advection speed,
+$A$ and $B$ are the concentrations of chemical species that remain constant over
+space and time, and $\epsilon$ is a parameter that varies the stiffness of the
+system. The problem is solved on the domain $(x,y,z) = X$ in $[0, X_{\text{max}}]^3$, 
+for times $t$ in $[0,t_f]$. The initial condition is
+```math
+\begin{align}
+    u(0,X) &= A + p(X) \\
+    v(0,X) &= B / A + p(X) \\
+    w(0,X) &= 3.0 + p(X)
+\end{align}
+```
+where the perturbation function is
+```math
+    p(X) = \alpha \exp\left( -\frac{(X-\mu)^T \sigma^{-1} (X-\mu)}{2 \sqrt{|\sigma| (2\pi)^3}} \right)
+```
+with $\alpha = 0.1$, $\mu = 0.5 X_{\text{max}}$, and $\sigma$ is a diagonal 
+matrix with entries $0.25 X_{\text{max}}$.
+
+Spatial derivatives are discretized with first-order upwind finite differences
+on a uniform spatial grid. The system can be evolved in time using explicit,
+implicit, or IMEX methods from ARKODE, Adams or BDF methods from CVODE, or BDF
+methods from IDA. When using an IMEX method, advection is treated explicitly and
+reactions implicitly.
+
+The nonlinear system(s) that arise in each time step may be solved using a
+global Newton method with a matrix-free GMRES linear solver or an Anderson
+accelerated fixed-point method. When using an IMEX method, a custom task-local
+nonlinear solver that leverages the locality of the reaction systems may also be
+used.
+
+## Options
+
+Several command line options are available to change the problem parameters
+as well as the integrator and solver options. A summary of the options is
+listed below.
+
+| Option                      | Description                                                                   | Default     |
+|:----------------------------|:------------------------------------------------------------------------------|:------------|
+| `--help`                    | Print the command line options and description                                | --          |
+| `--dont-save`               | Do not save the solution to the disk                                          | Save        |
+| `--output-dir <dir>`        | Directory where all output files will be written                              | `.`         |
+| `--nout <int>`              | Number of output times                                                        | 40          |
+| `--npts <int>`              | Number of mesh points in each direction                                       | 100         |
+| `--npxyz <int> <int> <int>` | Number of MPI tasks in each direction (0 forces MPI to decide)                | 0 0 0       |
+| `--xmax <realtype>`         | Maximum value of `x`, `y`, and `z` ($X_{\text{max}}$)                         | 1.0         |
+| `--A <realtype>`            | Constant concentration of species `A`                                         | 1.0         |
+| `--B <realtype>`            | Constant concentration of species `B`                                         | 3.5         |
+| `--c <realtype>`            | Advection speed `c`                                                           | 0.01        |
+| `--order <int>`             | Integration method order                                                      | 3           |
+| `--method <method>`         | Integrator to use: `ERK`, `ARK-DIRK`, `ARK-IMEX`, `CV-BDF`, `CV-ADAMS`, `IDA` | `ARK-DIRK`  |
+| `--nls <method>`            | Nonlinear Solver Method: `newton`, `tl-newton`, `fixedpoint`, `none`          | `newton`    |
+| `--fpaccel <int>`           | Number of fixed point acceleration vectors                                    | 3           |
+| `--nopre`                   | Disable preconditioning                                                       | False       |
+| `--fused`                   | Enable fused operations                                                       | Off         |
+| `--tf <realtype>`           | Final integration time `t_f`                                                  | 10.0        |
+| `--rtol <realtype>`         | Relative tolerance                                                            | 1.0e-6      |
+| `--atol <realtype>`         | Absolute tolerance                                                            | 1.0e-9      |
+
+## Building and Running
+
+To build the benchmark executables SUNDIALS must be configured with ARKODE,
+CVODE, and IDA enabled and with MPI and Kokkos support on. Additionally, either
+CUDA or HIP support must be on to build executables utilizing NVIDIA or AMD
+GPUs. See the installation guide for more details on configuring, building,
+and installing SUNDIALS.
+
+Based on the configuration the following executables will be built and installed
+in the `<benchmarks install prefix>/advection_reaction_3D/kokkos` directory:
+
+* `advection_reaction_3D_kokkos.SERIAL` -- MPI parallelism
+* `advection_reaction_3D_kokkos.OPENMP` -- MPI + OpenMP parallelism
+* `advection_reaction_3D_kokkos.CUDA` -- MPI + CUDA parallelism
+* `advection_reaction_3D_kokkos.HIP` -- MPI + HIP parallelism
+
+On Summit, with the default environment
+```
+  Compiler: xl/16.1.1-5
+  MPI: spectrum-mpi/10.3.1.2-20200121
+  CUDA: cuda/10.1.243
+```
+an example `jsrun` command is
+```
+jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_kokkos.CUDA
+```
+
+On Lassen, with the environment
+```
+  Compiler: gcc/8.3.1
+  MPI: mvapich2/2021.05.28-cuda-11.1.1
+  CUDA: cuda/11.1.1
+```
+an example `jsrun` command is
+```
+jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_kokkos.CUDA
+```
diff --git a/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.cpp b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.cpp
new file mode 100644
index 0000000000..fa9f2bcc94
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.cpp
@@ -0,0 +1,711 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                David J. Gardner, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * This benchmark problem simulates the advection and reaction of three
+ * chemical species, u, v, and w, in a three dimensional domain. The reaction
+ * mechanism is a variation of the Brusselator problem from chemical kinetics.
+ * This is a PDE system with 3 components, Y = [u,v,w], satisfying the
+ * equations,
+ *
+ *    u_t = -c * dot(grad,u) + A - (w+1) * u + v * u^2
+ *    v_t = -c * dot(grad,v) + w * u - v * u^2
+ *    w_t = -c * dot(grad,w) + (B - w) / ep - w * u
+ *
+ * for t in [0,tf] and X = (x,y,z) in [0,xmax]^3, with periodic boundary
+ * conditions. The initial condition is
+ *
+ *    u(0,X) = k1 * A / k4 + p(X)
+ *    v(0,X) = k2 * k4 * B / (k1 * k3 * A) + p(X)
+ *    w(0,X) = 3.0 + p(X)
+ *    p(X)   = alpha * e^( -((X - mu)^T Sigma^{-1} (X - mu)) / (2*sqrt(|Sigma|*(2pi)^3)) )
+ *
+ * alpha = 0.1, mu = (xmax/2.0, xmax/2.0, xmax/2.0), and Sigma = diag(xmax/4.0).
+ * The reaction rates are set so k_1 = k_2 = k_3 = k_4 = k, and k_5 = k_6
+ * = 1/5e-6. The spatial derivatives are discretized with first-order upwind
+ * finite differences. NOUT outputs are printed at equal intervals, and run
+ * statistics are printed at the end.
+ *
+ * Command line options:
+ *   --help             prints this message
+ *   --dont-save        do not save the solution to the filesystem at the nout interval (default is to save)
+ *   --output-dir       the directory where all output files will be written
+ *   --nout <int>       number of output times
+ *   --method           ERK, ARK-DIRK (default), ARK-IMEX, CV-BDF, CV-ADAMS, IDA
+ *   --nls              nonlinear solver to use; options are newton,
+ *                      tl-newton (task-local newton), or fixedpoint
+ *   --fpaccel          the number of fixed-point acceleration vectors to use
+ *                      (only valid when using fixedpoint nonlinear solver)
+ *   --nopre            turn off preconditioning
+ *   --order <int>      the method order to use
+ *   --npts <int>       number of mesh points in each direction
+ *   --xmax <realtype>  maximum value of x (size of domain)
+ *   --tf <realtype>    final time
+ *   --A <realtype>     A parameter value
+ *   --B <realtype>     B parameter value
+ *   --k <realtype>     reaction rate
+ *   --c <realtype>     advection speed
+ *   --rtol <realtype>  relative tolerance
+ *   --atol <realtype>  absolute tolerance
+ * --------------------------------------------------------------------------*/
+
+#include "advection_reaction_3D.hpp"
+
+
+/* Main Program: initializes MPI, the SUNDIALS context, and Kokkos, sets up the
+   problem, integrates it with the selected method, and finalizes everything. */
+int main(int argc, char *argv[])
+{
+  SUNContext ctx;
+
+  /* Initialize MPI */
+  MPI_Comm comm = MPI_COMM_WORLD;
+  MPI_Init(&argc, &argv);
+
+  /* Create SUNDIALS context */
+  SUNContext_Create((void*) &comm, &ctx);
+
+  /* Initialize Kokkos */
+  Kokkos::initialize(argc, argv);
+  {
+    /* This scope closes before Kokkos::finalize(), so the objects below are destroyed first */
+    /* General problem variables */
+    N_Vector     y = NULL;      /* empty solution vector        */
+    UserData     udata(ctx);    /* user data                    */
+    UserOptions  uopt;          /* user options                 */
+    int          retval;        /* reusable error-checking flag */
+
+    SUNDIALS_CXX_MARK_FUNCTION(udata.prof);
+
+    /* Process input arguments and set up the problem (this also fills in udata.myid) */
+    retval = SetupProblem(argc, argv, &udata, &uopt, ctx);
+    if (check_retval(&retval, "SetupProblem", 1, udata.myid)) MPI_Abort(comm, 1);
+
+    /* Create solution vector (on-node and MPI-parallel versions) */
+    SUNVector yloc{(unsigned int)udata.grid->neq, ctx};
+    y = N_VMake_MPIPlusX(udata.comm, yloc, ctx);
+    if (check_retval((void *) y, "N_VMake_MPIPlusX", 0, udata.myid)) MPI_Abort(comm, 1);
+
+    /* Set the initial condition */
+    retval = SetIC(y, &udata);
+    if (check_retval(&retval, "SetIC", 1, udata.myid)) MPI_Abort(comm, 1);
+
+    /* Output spatial mesh to disk (add extra point for periodic BC); rank 0 only, and only if nout > 0 */
+    if (udata.myid == 0 && uopt.nout > 0)
+    {
+      char fname[MXSTR];
+      snprintf(fname, MXSTR, "%s/mesh.txt", uopt.outputdir);
+      udata.grid->MeshToFile(fname);
+    }
+
+    /* Integrate in time with the driver that matches the selected method name */
+    if (uopt.method == "ERK")           retval = EvolveProblemExplicit(y, &udata, &uopt);
+    else if (uopt.method == "ARK-DIRK") retval = EvolveProblemDIRK(y, &udata, &uopt);
+    else if (uopt.method == "ARK-IMEX") retval = EvolveProblemIMEX(y, &udata, &uopt);
+    else if (uopt.method == "CV-BDF")   retval = EvolveProblemBDF(y, &udata, &uopt);
+    else if (uopt.method == "CV-ADAMS") retval = EvolveProblemAdams(y, &udata, &uopt);
+    else if (uopt.method == "IDA")      retval = EvolveDAEProblem(y, &udata, &uopt);
+    if (check_retval(&retval, "Evolve", 1, udata.myid)) MPI_Abort(comm, 1);
+
+    /* Clean up (yloc, uopt, and udata are destroyed when this scope closes) */
+    N_VDestroy(y);
+  }
+  Kokkos::finalize();
+  SUNContext_Free(&ctx);
+  MPI_Finalize();
+  return(0);
+}
+
+
+/* Destructor for problem data: closes the output files, destroys the solution
+   masks, and deletes the parallel grid. */
+UserData::~UserData()
+{
+  /* Close every output stream that was opened (the handles stay nullptr otherwise).
+     uopt is deliberately not dereferenced here: it points to a caller-owned object
+     that may already be destroyed (in main, uopt is declared after udata). */
+  if (UFID) fclose(UFID);
+  if (VFID) fclose(VFID);
+  if (WFID) fclose(WFID);
+  if (TFID) fclose(TFID);
+
+  /* free solution masks */
+  if (umask != nullptr) {
+    N_VDestroy(umask);
+    umask = nullptr;
+  }
+  if (vmask != nullptr) {
+    N_VDestroy(vmask);
+    vmask = nullptr;
+  }
+  if (wmask != nullptr) {
+    N_VDestroy(wmask);
+    wmask = nullptr;
+  }
+
+  /* free the parallel grid */
+  delete grid;
+}
+
+
+/* --------------------------------------------------------------
+ * Communication functions
+ * --------------------------------------------------------------*/
+
+/* Fills send buffers before exchanging neighbor information. For c > 0 the last
+   local plane in each direction is copied into the EAST/NORTH/FRONT buffers;
+   for c < 0 the first local plane is copied into the WEST/SOUTH/BACK buffers;
+   for c == 0 no buffer is touched. Always returns 0. */
+int FillSendBuffers(N_Vector y, UserData* udata)
+{
+  /* Shortcuts */
+  const realtype c = udata->c;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const int dof = udata->grid->dof;
+
+  /* Create 4D view of the vector */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+
+  if (c > 0.0)
+  {
+    /* Flow moving in the positive directions uses backward difference. */
+
+    /* Create 4D views of send buffers (each is one plane thick in its own direction) */
+    Vec4D Esend(udata->grid->GetSendView("EAST"),  1, nyl, nzl, dof);
+    Vec4D Nsend(udata->grid->GetSendView("NORTH"), nxl, 1, nzl, dof);
+    Vec4D Fsend(udata->grid->GetSendView("FRONT"), nxl, nyl, 1, dof);
+
+    /* Fill buffers on device. NOTE(review): no fence is issued here; presumably the caller synchronizes before the MPI exchange -- confirm */
+    Kokkos::parallel_for("FillEastBuffer",
+                         Range3D({0,0,0},{nyl,nzl,dof}),
+                         KOKKOS_LAMBDA (int j, int k, int l) {
+      Esend(0,j,k,l) = Yview(nxl-1,j,k,l);
+    });
+    Kokkos::parallel_for("FillNorthBuffer",
+                         Range3D({0,0,0},{nxl,nzl,dof}),
+                         KOKKOS_LAMBDA (int i, int k, int l) {
+      Nsend(i,0,k,l) = Yview(i,nyl-1,k,l);
+    });
+    Kokkos::parallel_for("FillFrontBuffer",
+                         Range3D({0,0,0},{nxl,nyl,dof}),
+                         KOKKOS_LAMBDA (int i, int j, int l) {
+      Fsend(i,j,0,l) = Yview(i,j,nzl-1,l);
+    });
+
+  }
+  else if (c < 0.0)
+  {
+    /* Flow moving in the negative directions uses forward difference. */
+
+    /* Create 4D views of send buffers (each is one plane thick in its own direction) */
+    Vec4D Wsend(udata->grid->GetSendView("WEST"),  1, nyl, nzl, dof);
+    Vec4D Ssend(udata->grid->GetSendView("SOUTH"), nxl, 1, nzl, dof);
+    Vec4D Bsend(udata->grid->GetSendView("BACK"),  nxl, nyl, 1, dof);
+
+    /* Fill buffers on device from the first local plane (index 0) in each direction */
+    Kokkos::parallel_for("FillWestBuffer",
+                         Range3D({0,0,0},{nyl,nzl,dof}),
+                         KOKKOS_LAMBDA (int j, int k, int l) {
+      Wsend(0,j,k,l) = Yview(0,j,k,l);
+    });
+    Kokkos::parallel_for("FillSouthBuffer",
+                         Range3D({0,0,0},{nxl,nzl,dof}),
+                         KOKKOS_LAMBDA (int i, int k, int l) {
+      Ssend(i,0,k,l) = Yview(i,0,k,l);
+    });
+    Kokkos::parallel_for("FillBackBuffer",
+                         Range3D({0,0,0},{nxl,nyl,dof}),
+                         KOKKOS_LAMBDA (int i, int j, int l) {
+      Bsend(i,j,0,l) = Yview(i,j,0,l);
+    });
+
+  }
+
+  return(0);
+}
+
+
+/* --------------------------------------------------------------
+ * Problem setup
+ * --------------------------------------------------------------*/
+
+/* Parses the CLI arguments and stores the values in udata and uopt.
+ *
+ * Returns 0 on success. Returns -1 after printing the usage message (via
+ * InputError) when --help is given, an option is unknown or is missing its
+ * value(s), or the method / nonlinear solver name is not recognized, and -1
+ * after a short error message when the output directory name is too long.
+ * Numeric values are converted with atoi/strtod and are not validated
+ * further. */
+int ParseArgs(int argc, char *argv[], UserData* udata, UserOptions* uopt)
+{
+  /* loop over input args and get value (nothing to do when argc <= 1) */
+  for (int i = 1; i < argc; i++)
+  {
+    string argvi(argv[i]);
+
+    /* Number of values that must follow this option: flags take none, --npxyz
+       takes three, everything else takes one. Checking this up front keeps
+       every argv[++i] below inside argv. */
+    int nvals = 1;
+    if (argvi == "--help"  || argvi == "--dont-save" ||
+        argvi == "--nopre" || argvi == "--fused")
+      nvals = 0;
+    else if (argvi == "--npxyz")
+      nvals = 3;
+    if (i + nvals >= argc)
+    {
+      if (udata->myid == 0)
+        fprintf(stderr, "ERROR: option %s is unknown or is missing its value(s)\n", argv[i]);
+      InputError(argv[0]);
+      return(-1);
+    }
+
+    /* dispatch on the option name */
+    if (argvi.compare("--help") == 0)
+    {
+      InputError(argv[0]);
+      return(-1);
+    }
+    else if (argvi.compare("--nout") == 0)
+      uopt->nout = atoi(argv[++i]);
+    else if (argvi.compare("--dont-save") == 0)
+      uopt->save = 0;
+    else if (argvi.compare("--output-dir") == 0)
+    {
+      /* reserve room for the file names that are appended to the directory */
+      if (strlen(argv[i+1]) + 32 > (size_t) MXSTR)
+      {
+        if (udata->myid == 0)
+          fprintf(stderr, "ERROR: output directory string is too long\n");
+        return(-1);
+      }
+      uopt->outputdir = argv[++i];
+    }
+    else if (argvi.compare("--npts") == 0)
+      uopt->npts = atoi(argv[++i]);
+    else if (argvi.compare("--npxyz") == 0)
+    {
+      uopt->npxyz[0] = atoi(argv[++i]);
+      uopt->npxyz[1] = atoi(argv[++i]);
+      uopt->npxyz[2] = atoi(argv[++i]);
+    }
+    else if (argvi.compare("--xmax") == 0)
+      udata->xmax = strtod(argv[++i], NULL);
+    else if (argvi.compare("--A") == 0)
+      udata->A = strtod(argv[++i], NULL);
+    else if (argvi.compare("--B") == 0)
+      udata->B = strtod(argv[++i], NULL);
+    else if (argvi.compare("--k") == 0)
+    {
+      /* One value sets all four rates, as the usage message documents. The
+         older form with four separate values (k1 k2 k3 k4) is still accepted
+         when three more arguments follow and the next one is not an option. */
+      udata->k1 = udata->k2 = udata->k3 = udata->k4 = strtod(argv[++i], NULL);
+      if (i + 3 < argc && strncmp(argv[i+1], "--", 2) != 0)
+      {
+        udata->k2 = strtod(argv[++i], NULL);
+        udata->k3 = strtod(argv[++i], NULL);
+        udata->k4 = strtod(argv[++i], NULL);
+      }
+    }
+    else if (argvi.compare("--c") == 0)
+      udata->c = strtod(argv[++i], NULL);
+    else if (argvi.compare("--order") == 0)
+      uopt->order = atoi(argv[++i]);
+    else if (argvi.compare("--method") == 0)
+    {
+      uopt->method = string(argv[++i]);
+      if (uopt->method != "ERK" &&
+          uopt->method != "ARK-DIRK" &&
+          uopt->method != "ARK-IMEX" &&
+          uopt->method != "CV-BDF" &&
+          uopt->method != "CV-ADAMS" &&
+          uopt->method != "IDA")
+      {
+        fprintf(stderr, "ERROR: unknown method\n");
+        InputError(argv[0]);
+        return(-1);
+      }
+    }
+    else if (argvi.compare("--fpaccel") == 0)
+      uopt->fpaccel = atoi(argv[++i]);
+    else if (argvi.compare("--nls") == 0)
+    {
+      uopt->nls = string(argv[++i]);
+      if (uopt->nls != "newton" &&
+          uopt->nls != "tl-newton" &&
+          uopt->nls != "fixedpoint" &&
+          uopt->nls != "none")
+      {
+        fprintf(stderr, "ERROR: unknown nls\n");
+        InputError(argv[0]);
+        return(-1);
+      }
+    }
+    else if (argvi.compare("--nopre") == 0)
+      uopt->precond = 0;
+    else if (argvi.compare("--fused") == 0)
+      uopt->fused = 1;
+    else if (argvi.compare("--tf") == 0)
+      uopt->tf = strtod(argv[++i], NULL);
+    else if (argvi.compare("--rtol") == 0)
+      uopt->rtol = strtod(argv[++i], NULL);
+    else if (argvi.compare("--atol") == 0)
+      uopt->atol = strtod(argv[++i], NULL);
+    else
+    {
+      InputError(argv[0]);
+      return(-1);
+    }
+  }
+
+  /* Explicit method uses no nonlinear solver */
+  if (uopt->method == "ERK")
+    uopt->nls = "none";
+
+  /* CV Adams method only uses fixedpoint nonlinear solver */
+  if (uopt->method == "CV-ADAMS")
+    uopt->nls = "fixedpoint";
+
+  return(0);
+}
+
+
+/* Fills the mask vector for the component so that
+   u = y .* umask, v = y .* vmask, w = y .* wmask: the mask is set to 1.0 in the
+   entries of the given component at every local grid point and to 0.0 elsewhere. */
+int ComponentMask(N_Vector mask, const int component, const UserData* udata)
+{
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+  /* Shortcuts */
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const int dof = udata->grid->dof;
+
+  /* Create 4D view of mask data */
+  Vec4D maskview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(mask)), nxl, nyl, nzl, dof);
+
+  /* Fill mask data: zero everything first, then write 1.0 into the selected component */
+  N_VConst(0.0, mask);
+  Kokkos::parallel_for("Fill_mask",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+    maskview(i,j,k,component) = 1.0;
+  });
+
+  return 0;
+}
+
+
+/* Parses the CLI arguments and sets up the problem (defaults, CLI overrides, grid, solution
+   masks, output files, setup summary). Returns 0 on success and -1 if parsing or a file open fails. */
+int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
+                 SUNContext ctx)
+{
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+  /* MPI variables */
+  udata->comm = MPI_COMM_WORLD;
+  MPI_Comm_rank(udata->comm, &udata->myid);
+  MPI_Comm_size(udata->comm, &udata->nprocs);
+
+  /* Default problem parameters */
+  udata->add_reactions = true;
+  udata->xmax  = 1.0;
+  udata->A     = 1.0;
+  udata->B     = 3.5;
+  udata->k1    = 1.0;
+  udata->k2    = 1.0;
+  udata->k3    = 1.0;
+  udata->k4    = 1.0;
+  udata->k5    = 1.0/5.0e-6;
+  udata->k6    = 1.0/5.0e-6;
+  udata->c     = 0.01;
+  udata->uopt  = uopt;
+  udata->TFID  = NULL;
+  udata->UFID  = NULL;
+  udata->VFID  = NULL;
+  udata->WFID  = NULL;
+  udata->nnlfi = 0;
+
+  /* Set default integrator options */
+  uopt->npxyz[0]  = 0;            /* number of processors in x */
+  uopt->npxyz[1]  = 0;            /* number of processors in y */
+  uopt->npxyz[2]  = 0;            /* number of processors in z */
+  uopt->npts      = 100;          /* number of mesh points in each direction */
+  uopt->order     = 3;            /* method order             */
+  uopt->method    = "ARK-DIRK";   /* stepper/method           */
+  uopt->t0        = 0.0;          /* initial time             */
+  uopt->tf        = 10.0;         /* final time               */
+  uopt->rtol      = 1.0e-6;       /* relative tolerance       */
+  uopt->atol      = 1.0e-9;       /* absolute tolerance       */
+  uopt->nls       = "newton";     /* default to newton, when appropriate */
+  uopt->fpaccel   = 3;            /* default number of fixed point acceleration vectors */
+  uopt->precond   = 1;            /* by default, precondition when appropriate */
+  uopt->fused     = 0;            /* use fused vector ops     */
+  uopt->save      = 1;            /* save solution to disk    */
+  uopt->nout      = 10;           /* number of output times   */
+  uopt->outputdir = (char *) "."; /* output directory         */
+
+  /* Parse CLI args and set udata/uopt appropriately */
+  int retval = ParseArgs(argc, argv, udata, uopt);
+  if (check_retval((void*)&retval, "ParseArgs", 1, udata->myid)) return -1;
+
+  /* Setup the parallel decomposition */
+  const sunindextype npts[] = {uopt->npts, uopt->npts, uopt->npts};
+  const realtype amax[] = {0.0, 0.0, 0.0};
+  const realtype bmax[] = {udata->xmax, udata->xmax, udata->xmax};
+  udata->grid = new ParallelGrid<sunindextype>(&udata->comm, amax, bmax, npts,
+      3, BoundaryType::PERIODIC, StencilType::UPWIND, udata->c, uopt->npxyz);
+
+  /* Create the solution masks. NOTE(review): the SUNVector objects allocated with new here are not deleted in this function -- confirm who owns them */
+  SUNVector *umaskloc = new SUNVector((unsigned int)udata->grid->neq, ctx);
+  udata->umask = N_VMake_MPIPlusX(udata->comm, *umaskloc, ctx);
+  if (check_retval((void *) udata->umask, "N_VMake_MPIPlusX", 0, udata->myid)) MPI_Abort(udata->comm, 1);
+  SUNVector *vmaskloc = new SUNVector((unsigned int)udata->grid->neq, ctx);
+  udata->vmask = N_VMake_MPIPlusX(udata->comm, *vmaskloc, ctx);
+  if (check_retval((void *) udata->vmask, "N_VMake_MPIPlusX", 0, udata->myid)) MPI_Abort(udata->comm, 1);
+  SUNVector *wmaskloc = new SUNVector((unsigned int)udata->grid->neq, ctx);
+  udata->wmask = N_VMake_MPIPlusX(udata->comm, *wmaskloc, ctx);
+  if (check_retval((void *) udata->wmask, "N_VMake_MPIPlusX", 0, udata->myid)) MPI_Abort(udata->comm, 1);
+  ComponentMask(udata->umask, 0, udata);
+  ComponentMask(udata->vmask, 1, udata);
+  ComponentMask(udata->wmask, 2, udata);
+
+  /* Open output files for results; snprintf bounds the write into fname and a failed fopen is reported instead of leaving a NULL handle behind */
+  if (uopt->save)
+  {
+    char fname[MXSTR];
+    if (udata->myid == 0)
+    {
+      snprintf(fname, MXSTR, "%s/t.%06d.txt", uopt->outputdir, udata->myid);
+      udata->TFID = fopen(fname, "w");
+    }
+    snprintf(fname, MXSTR, "%s/u.%06d.txt", uopt->outputdir, udata->myid);
+    udata->UFID = fopen(fname, "w");
+    snprintf(fname, MXSTR, "%s/v.%06d.txt", uopt->outputdir, udata->myid);
+    udata->VFID = fopen(fname, "w");
+    snprintf(fname, MXSTR, "%s/w.%06d.txt", uopt->outputdir, udata->myid);
+    udata->WFID = fopen(fname, "w");
+    if ((udata->myid == 0 && !udata->TFID) || !udata->UFID || !udata->VFID || !udata->WFID)
+    {
+      fprintf(stderr, "ERROR: rank %d could not open its output files in %s\n", udata->myid, uopt->outputdir);
+      return(-1);
+    }
+  }
+
+  /* Print problem setup */
+  if (udata->myid == 0)
+  {
+    printf("\n\t\tAdvection-Reaction Test Problem\n\n");
+    printf("Using the MPI+Kokkos NVECTOR");
+#if defined(USE_CUDA)
+    printf(" with the CUDA back-end\n");
+#elif defined(USE_HIP)
+    printf(" with the HIP back-end\n");
+#elif defined(USE_OPENMP)
+    printf(" with the OpenMP back-end and %i threads\n", omp_get_max_threads());
+#else
+    printf(" with the serial back-end\n");
+#endif
+    printf("Number of Processors = %li\n", (long int) udata->nprocs);
+    udata->grid->PrintInfo();
+    printf("Problem Parameters:\n");
+    printf("  A = %g\n", udata->A);
+    printf("  B = %g\n", udata->B);
+    printf("  k = %g\n", udata->k1);
+    printf("  c = %g\n", udata->c);
+    printf("Integrator Options:\n");
+    printf("  order            = %d\n", uopt->order);
+    printf("  method           = %s\n", uopt->method.c_str());
+    printf("  nonlinear solver = %s\n", uopt->nls.c_str());
+    printf("  fpaccel          = %d\n", uopt->fpaccel);
+    printf("  preconditioner   = %d\n", uopt->precond);
+    printf("  fused vector ops = %d\n", uopt->fused);
+    printf("  t0               = %g\n", uopt->t0);
+    printf("  tf               = %g\n", uopt->tf);
+    printf("  reltol           = %.1e\n", uopt->rtol);
+    printf("  abstol           = %.1e\n", uopt->atol);
+    printf("  nout             = %d\n", uopt->nout);
+    printf("Output directory: %s\n", uopt->outputdir);
+  }
+
+  /* return success */
+  return(0);
+}
+
+
+/* Replaces x, y, and z in place by alpha*exp(-((s - mu)^2/sigma)/scale) for s = x, y, z, with alpha = 0.1, mu = xmax/2, sigma = xmax/4. */
+KOKKOS_FUNCTION
+void Gaussian3D(realtype& x, realtype& y, realtype& z, realtype xmax)
+{
+  /* amplitude, center, and diagonal entry of Sigma (the same in every direction) */
+  const realtype amp    = 0.1;
+  const realtype center = xmax/RCONST(2.0);
+  const realtype width  = xmax/RCONST(4.0);
+  /* scale = 2*sqrt(|Sigma|*(2pi)^3) */
+  const realtype scale = 2.0 * sqrt((width*width*width)*pow(2*M_PI,3));
+  const realtype dx = x - center, dy = y - center, dz = z - center;
+  x = amp * exp( -(dx*dx*(1.0/width)) / scale );
+  y = amp * exp( -(dy*dy*(1.0/width)) / scale );
+  z = amp * exp( -(dz*dz*(1.0/width)) / scale );
+}
+
+
+/* Initial condition function: sets y to the steady state values plus the
+   perturbation gx + gy + gz produced by Gaussian3D at each local grid point. */
+int SetIC(N_Vector y, UserData* udata)
+{
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* Local extents, number of fields, and mesh spacing. Everything the kernel
+     needs is copied into plain scalars so the lambda captures values only. */
+  const auto* mesh = udata->grid;
+  const int      nx   = mesh->nxl;
+  const int      ny   = mesh->nyl;
+  const int      nz   = mesh->nzl;
+  const int      nvar = mesh->dof;
+  const realtype hx   = mesh->dx;
+  const realtype hy   = mesh->dy;
+  const realtype hz   = mesh->dz;
+  const realtype L    = udata->xmax;
+
+  /* grid->coords entries; coords * (local extent) offsets the local index below */
+  const int px = mesh->coords[0];
+  const int py = mesh->coords[1];
+  const int pz = mesh->coords[2];
+
+  /* Steady state solution, built from the problem parameters */
+  const realtype ustar = udata->k1 * udata->A / udata->k4;
+  const realtype vstar = udata->k2 * udata->k4 * udata->B / (udata->k1 * udata->k3 * udata->A);
+  const realtype wstar = 3.0;
+
+  /* 4D (x, y, z, field) view of the local data of y */
+  Vec4D Y(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nx, ny, nz, nvar);
+
+  /* Gaussian perturbation of the steady state solution */
+  Kokkos::parallel_for("SetIC",
+                       Range3D({0,0,0},{nx,ny,nz}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+    /* coordinates of the point; Gaussian3D overwrites them with its values */
+    realtype gx = (px * nx + i) * hx;
+    realtype gy = (py * ny + j) * hy;
+    realtype gz = (pz * nz + k) * hz;
+    Gaussian3D(gx, gy, gz, L);
+    const realtype bump = gx + gy + gz;
+    Y(i,j,k,0) = ustar + bump;
+    Y(i,j,k,1) = vstar + bump;
+    Y(i,j,k,2) = wstar + bump;
+  });
+
+  /* Return success */
+  return(0);
+}
+
+
+/* Prints the component norms to the screen and, when saving, writes time and solution to disk; returns 0, or -1 if saving without open files */
+int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* output current solution norm to screen (masked norm of each component, rescaled by 1/sqrt(N)) */
+  realtype N = (realtype) udata->grid->npts();
+  realtype u = N_VWL2Norm(y, udata->umask);
+  u = sqrt(u*u/N);
+  realtype v = N_VWL2Norm(y, udata->vmask);
+  v = sqrt(v*v/N);
+  realtype w = N_VWL2Norm(y, udata->wmask);
+  w = sqrt(w*w/N);
+  if (udata->myid == 0) {
+    printf("     %10.6f   %10.6f   %10.6f   %10.6f\n", t, u, v, w);
+    std::fflush(stdout);
+  }
+
+  if (uopt->save)
+  {
+    /* The files are opened in SetupProblem; report a missing stream (e.g. after a failed fopen) instead of passing NULL to fprintf */
+    if (!udata->UFID || !udata->VFID || !udata->WFID) {
+      fprintf(stderr, "ERROR: rank %d has no open output files\n", udata->myid);
+      return(-1);
+    }
+    /* Copy solution data to host mirror view */
+    SUNVector* ylocal = sundials::kokkos::GetVec<SUNVector>(N_VGetLocalVector_MPIPlusX(y));
+    sundials::kokkos::CopyFromDevice(*ylocal);
+
+    /* output the times to disk */
+    if (udata->myid == 0 && udata->TFID) {
+      fprintf(udata->TFID," %.16e\n", t);
+      std::fflush(udata->TFID);
+    }
+
+    /* create 4D view of host data */
+    const int nxl = udata->grid->nxl;
+    const int nyl = udata->grid->nyl;
+    const int nzl = udata->grid->nzl;
+    const int dof = udata->grid->dof;
+    Vec4DHost yview(N_VGetArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+
+    /* output results to disk: one line per call in each file, then flush */
+    for (int i = 0; i < nxl; i++)
+      for (int j = 0; j < nyl; j++)
+        for (int k = 0; k < nzl; k++) {
+          fprintf(udata->UFID," %.16e", yview(i,j,k,0));
+          fprintf(udata->VFID," %.16e", yview(i,j,k,1));
+          fprintf(udata->WFID," %.16e", yview(i,j,k,2));
+        }
+    FILE* const fids[] = { udata->UFID, udata->VFID, udata->WFID };
+    for (FILE* fid : fids) { fprintf(fid,"\n"); std::fflush(fid); }
+  }
+
+  return(0);
+}
+
+
+/* Prints the command line usage to stderr on rank 0; all ranks then meet at a barrier */
+void InputError(char *name)
+{
+  int myid;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+  if (myid == 0)
+  {
+    fprintf(stderr, "\nERROR: Invalid command line input\n");
+    fprintf(stderr, "\nCommand line options for %s\n",name);
+    fprintf(stderr, "  --help                    prints this message\n");
+    fprintf(stderr, "  --output-dir              the directory where all output files will be written (default is the CWD)\n");
+    fprintf(stderr, "  --nout <int>              number of output times to print (default is 10)\n");
+    fprintf(stderr, "  --dont-save               do not save the solution to the filesystem at the nout interval (default is to save)\n");
+    fprintf(stderr, "  --method                  ERK, ARK-DIRK (default), ARK-IMEX, CV-BDF, CV-ADAMS, IDA\n");
+    fprintf(stderr, "  --fpaccel                 the number of fixed-point acceleration vectors to use (only valid when using fixedpoint nonlinear solver)\n");
+    fprintf(stderr, "  --nls                     nonlinear solver to use (newton, tl-newton (task-local newton), fixedpoint)\n");
+    fprintf(stderr, "  --nopre                   do not precondition the linear system\n");
+    fprintf(stderr, "  --fused                   enable fused vector operations (off by default)\n");
+    fprintf(stderr, "  --order <int>             the method order to use\n");
+    fprintf(stderr, "  --npts <int>              number of mesh points in each direction\n");
+    fprintf(stderr, "  --npxyz <int> <int> <int> number of processors in each direction (0 forces MPI to decide)\n");
+    fprintf(stderr, "  --xmax <realtype>         maximum value of x (size of domain)\n");
+    fprintf(stderr, "  --tf <realtype>           final time\n");
+    fprintf(stderr, "  --A <realtype>            A parameter value\n");
+    fprintf(stderr, "  --B <realtype>            B parameter value\n");
+    fprintf(stderr, "  --k <realtype>            reaction rate\n");
+    fprintf(stderr, "  --c <realtype>            advection speed\n");
+    fprintf(stderr, "  --rtol <realtype>         relative tolerance\n");
+    fprintf(stderr, "  --atol <realtype>         absolute tolerance\n");
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+}
diff --git a/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.hpp b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.hpp
new file mode 100644
index 0000000000..cb0dceea64
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.hpp
@@ -0,0 +1,171 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                David J. Gardner, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#ifndef ADVECTION_REACTION_3D_HPP
+#define ADVECTION_REACTION_3D_HPP
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <mpi.h>
+
+#include <sundials/sundials_context.h>
+#include <nvector/nvector_mpiplusx.h>
+#include "nvector/nvector_kokkos.hpp"
+#include "check_retval.h"
+#include "ParallelGrid.hpp"
+
+/* Set SUNDIALS Kokkos vector shortcut */
+using SUNVector = sundials::kokkos::Vector<ExecSpace>;
+
+using sundials_tools::ParallelGrid;
+using sundials_tools::BoundaryType;
+using sundials_tools::StencilType;
+using std::string;
+
+/* Maximum size of output directory string */
+constexpr int MXSTR = 2048;
+
+/*
+ * Data structure for problem options (defaults are assigned in SetupProblem)
+ */
+
+struct UserOptions
+{
+  int      npxyz[3]; /* number of processors in x,y,z */
+  sunindextype npts; /* number of spatial mesh points in each direction */
+  realtype t0;       /* initial time                  */
+  realtype tf;       /* final time                    */
+  realtype rtol;     /* relative tolerance            */
+  realtype atol;     /* absolute tolerance            */
+  int      order;    /* method order                  */
+  string   method;   /* method string                 */
+  string   nls;      /* nonlinear solver to use       */
+  int      fpaccel;  /* number of fixedpoint vectors  */
+  int      precond;  /* to precondition or not        */
+  int      fused;    /* use fused vector ops          */
+  int      nout;     /* number of outputs             */
+  int      save;     /* save solution to disk         */
+  char*    outputdir; /* output directory (not owned: points at argv or a string literal) */
+};
+
+
+/*
+ * Data structure for problem specific data
+ */
+struct UserData
+{
+  SUNContext ctx;
+  SUNProfiler prof;
+
+  /* MPI data */
+  MPI_Comm    comm;
+  int         myid;
+  int         nprocs;
+  MPI_Request req[2];
+
+  /* Should reactions be added to the advection or not */
+  bool add_reactions;
+
+  /* File handles for output */
+  FILE*  TFID;     /* time output file pointer     */
+  FILE*  UFID;     /* solution output file pointer */
+  FILE*  VFID;
+  FILE*  WFID;
+
+  /* Solution masks */
+  N_Vector umask;
+  N_Vector vmask;
+  N_Vector wmask;
+
+  /* Problem parameters */
+  realtype  xmax; /* maximum x value              */
+  realtype  A;    /* concentration of species A   */
+  realtype  B;    /* w source rate                */
+  realtype  k1;   /* reaction rates               */
+  realtype  k2;
+  realtype  k3;
+  realtype  k4;
+  realtype  k5;
+  realtype  k6;
+  realtype  c;    /* advection coefficient        */
+
+  /* Parallel mesh */
+  ParallelGrid<sunindextype>* grid;
+
+  /* Count of implicit function evals by the task local nonlinear solver */
+  long int nnlfi;
+
+  /* Integrator options */
+  UserOptions* uopt;
+
+  /* Constructor that takes the context. Every pointer member starts as nullptr (so `delete grid` in the
+     destructor never sees an indeterminate pointer); the initializers follow the declaration order. */
+  UserData(SUNContext ctx)
+    : ctx(ctx), TFID(nullptr), UFID(nullptr), VFID(nullptr), WFID(nullptr),
+      umask(nullptr), vmask(nullptr), wmask(nullptr), grid(nullptr), uopt(nullptr)
+  {
+    SUNContext_GetProfiler(ctx, &prof);
+  }
+
+  /* destructor frees the problem data */
+  ~UserData();
+};
+
+
+/*
+ * Functions to evolve the solution (defined by the drivers)
+ */
+
+/* function that does ARKStep setup and evolves the solution with a DIRK method */
+extern int EvolveProblemDIRK(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does ARKStep setup and evolves the solution with an IMEX method */
+extern int EvolveProblemIMEX(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does ERKStep setup and evolves the solution */
+extern int EvolveProblemExplicit(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does CVODE BDF setup and evolves the solution */
+extern int EvolveProblemBDF(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does CVODE Adams setup and evolves the solution */
+extern int EvolveProblemAdams(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does IDA BDF setup and evolves the solution */
+extern int EvolveDAEProblem(N_Vector y, UserData* udata, UserOptions* uopt);
+
+
+/*
+ * Helper functions
+ */
+
+/* function to set initial condition */
+int SetIC(N_Vector y, UserData* udata);
+
+/* function to fill neighbor data */
+int FillSendBuffers(N_Vector y, UserData* udata);
+
+/* functions for processing command line args */
+int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
+                 SUNContext ctx);
+void InputError(char *name);
+int ComponentMask(N_Vector mask, const int component, const UserData* udata);
+
+/* function to write solution to disk */
+int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt);
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/arkode_driver.cpp b/benchmarks/advection_reaction_3D/kokkos/arkode_driver.cpp
similarity index 98%
rename from benchmarks/advection_reaction_3D/arkode_driver.cpp
rename to benchmarks/advection_reaction_3D/kokkos/arkode_driver.cpp
index bbea07956a..e2cf1451e3 100644
--- a/benchmarks/advection_reaction_3D/arkode_driver.cpp
+++ b/benchmarks/advection_reaction_3D/kokkos/arkode_driver.cpp
@@ -588,10 +588,7 @@ int TaskLocalLSolve(N_Vector delta, void* arkode_mem)
   SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
 
   /* set up I - gamma*J and solve */
-  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                RAJA::RangeSegment(0, udata->grid->nyl),
-                                RAJA::RangeSegment(0, udata->grid->nzl));
-  retval = SolveReactionLinSys(z, delta, delta, gamma, range, udata);
+  retval = SolveReactionLinSys(z, delta, delta, gamma, udata);
 
 
   return(retval);
diff --git a/benchmarks/advection_reaction_3D/check_retval.h b/benchmarks/advection_reaction_3D/kokkos/check_retval.h
similarity index 99%
rename from benchmarks/advection_reaction_3D/check_retval.h
rename to benchmarks/advection_reaction_3D/kokkos/check_retval.h
index 31a4fa5922..887b7cea5d 100644
--- a/benchmarks/advection_reaction_3D/check_retval.h
+++ b/benchmarks/advection_reaction_3D/kokkos/check_retval.h
@@ -54,4 +54,4 @@ static int check_retval(void *returnvalue, const char *funcname, int opt, int my
   return(0);
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/benchmarks/advection_reaction_3D/cvode_driver.cpp b/benchmarks/advection_reaction_3D/kokkos/cvode_driver.cpp
similarity index 100%
rename from benchmarks/advection_reaction_3D/cvode_driver.cpp
rename to benchmarks/advection_reaction_3D/kokkos/cvode_driver.cpp
diff --git a/benchmarks/advection_reaction_3D/ida_driver.cpp b/benchmarks/advection_reaction_3D/kokkos/ida_driver.cpp
similarity index 100%
rename from benchmarks/advection_reaction_3D/ida_driver.cpp
rename to benchmarks/advection_reaction_3D/kokkos/ida_driver.cpp
diff --git a/benchmarks/advection_reaction_3D/kokkos/rhs3D.hpp b/benchmarks/advection_reaction_3D/kokkos/rhs3D.hpp
new file mode 100644
index 0000000000..34698146ab
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/kokkos/rhs3D.hpp
@@ -0,0 +1,540 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                David J. Gardner, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------*/
+
+#ifndef ADVECTION_REACTION_3D_RHS_HPP
+#define ADVECTION_REACTION_3D_RHS_HPP
+
+#include "advection_reaction_3D.hpp"
+
+/* --------------------------------------------------------------
+ * Right hand side (RHS) and residual functions
+ * --------------------------------------------------------------*/
+
+/* Compute the advection term f(t,y) = -c (grad * y). This is done using
+   upwind 1st order finite differences.  At present, only periodic boundary
+   conditions are supported, which are handled via MPI's Cartesian
+   communicator (even for serial runs).  The time t is not used, and
+   user_data must point to the problem's UserData.  Returns 0 on success,
+   or -1 if packing the send buffers or the neighbor exchange fails. */
+static int Advection(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  /* access problem data */
+  UserData* udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set variable shortcuts */
+  const int      nxl = udata->grid->nxl;
+  const int      nyl = udata->grid->nyl;
+  const int      nzl = udata->grid->nzl;
+  const int      dof = udata->grid->dof;
+  const realtype c   = udata->c;
+  const realtype cx  = -c / udata->grid->dx;
+  const realtype cy  = -c / udata->grid->dy;
+  const realtype cz  = -c / udata->grid->dz;
+
+  /* local variables */
+  int retval;
+
+  /* fill send buffers and begin exchanging boundary information */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = FillSendBuffers(y, udata);
+  if (check_retval(&retval, "FillSendBuffers", 1, udata->myid))
+    return(-1);
+  retval = udata->grid->ExchangeStart();
+  if (check_retval(&retval, "ExchangeStart", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+  /* set output to zero (ydot stays zero when c == 0: no kernel below runs) */
+  N_VConst(0.0, ydot);
+
+  /* create 4D views of the state and RHS vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D dYview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(ydot)), nxl, nyl, nzl, dof);
+
+  /* compute advection on the cells whose stencil needs no receive-buffer data */
+  if (c > 0.0)
+  {
+    /* flow moving in the positive x,y,z direction: backward differences */
+    Kokkos::parallel_for("AdvectionInteriorRight",
+                         Range3D({1,1,1},{nxl,nyl,nzl}),
+                         KOKKOS_LAMBDA (int i, int j, int k)
+    {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0)  = cz * (u_ijk - Yview(i,j,k-1,0)); // du/dz
+      dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j-1,k,0)); // du/dy
+      dYview(i,j,k,0) += cx * (u_ijk - Yview(i-1,j,k,0)); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1)  = cz * (v_ijk - Yview(i,j,k-1,1)); // dv/dz
+      dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j-1,k,1)); // dv/dy
+      dYview(i,j,k,1) += cx * (v_ijk - Yview(i-1,j,k,1)); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2)  = cz * (w_ijk - Yview(i,j,k-1,2)); // dw/dz
+      dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j-1,k,2)); // dw/dy
+      dYview(i,j,k,2) += cx * (w_ijk - Yview(i-1,j,k,2)); // dw/dx
+    });
+  }
+  else if (c < 0.0)
+  {
+    /* flow moving in the negative x,y,z direction: forward differences */
+    Kokkos::parallel_for("AdvectionInteriorLeft",
+                         Range3D({0,0,0},{nxl-1,nyl-1,nzl-1}),
+                         KOKKOS_LAMBDA (int i, int j, int k)
+    {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0)  = cz * (Yview(i,j,k+1,0) - u_ijk); // du/dz
+      dYview(i,j,k,0) += cy * (Yview(i,j+1,k,0) - u_ijk); // du/dy
+      dYview(i,j,k,0) += cx * (Yview(i+1,j,k,0) - u_ijk); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1)  = cz * (Yview(i,j,k+1,1) - v_ijk); // dv/dz
+      dYview(i,j,k,1) += cy * (Yview(i,j+1,k,1) - v_ijk); // dv/dy
+      dYview(i,j,k,1) += cx * (Yview(i+1,j,k,1) - v_ijk); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2)  = cz * (Yview(i,j,k+1,2) - w_ijk); // dw/dz
+      dYview(i,j,k,2) += cy * (Yview(i,j+1,k,2) - w_ijk); // dw/dy
+      dYview(i,j,k,2) += cx * (Yview(i+1,j,k,2) - w_ijk); // dw/dx
+    });
+  }
+
+  /* finish exchanging boundary information (receive buffers are read below) */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = udata->grid->ExchangeEnd();
+  if (check_retval(&retval, "ExchangeEnd", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+  /* compute advection at process boundaries */
+  if (c > 0.0)
+  {
+    /* Flow moving in the positive x,y,z direction:
+       boundaries are west face, south face, and back face */
+
+    /*   Create 4D views of receive buffers */
+    Vec4D Wrecv(udata->grid->GetRecvView("WEST"),  1, nyl, nzl, dof);
+    Vec4D Srecv(udata->grid->GetRecvView("SOUTH"), nxl, 1, nzl, dof);
+    Vec4D Brecv(udata->grid->GetRecvView("BACK"),  nxl, nyl, 1, dof);
+
+    /*   Compute each "lower" face; edge cells are recomputed with the same stencil by several kernels */
+    Kokkos::parallel_for("AdvectionBoundaryWest",
+                         Range3D({0,0,0},{nyl,nzl,dof}),
+                         KOKKOS_LAMBDA (int j, int k, int l)
+    {
+      const int i = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,0,k,l);
+      const realtype YBack  = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,0,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - Wrecv(0,j,k,l)); // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth);         // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack);          // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundarySouth",
+                         Range3D({0,0,0},{nxl,nzl,dof}),
+                         KOKKOS_LAMBDA (int i, int k, int l)
+    {
+      const int j = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YWest = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(0,j,k,l);
+      const realtype YBack = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,0,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - YWest);          // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - Srecv(i,0,k,l)); // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack);          // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundaryBack",
+                         Range3D({0,0,0},{nxl,nyl,dof}),
+                         KOKKOS_LAMBDA (int i, int j, int l)
+    {
+      const int k = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YWest  = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(0,j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,0,k,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - YWest);          // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth);         // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - Brecv(i,j,0,l)); // d/dz
+    });
+  }
+  else if (c < 0.0)
+  {
+    /* Flow moving in the negative x,y,z direction:
+       boundaries are east face, north face, and front face */
+
+    /*   Create 4D views of receive buffers */
+    Vec4D Erecv(udata->grid->GetRecvView("EAST"),  1, nyl, nzl, dof);
+    Vec4D Nrecv(udata->grid->GetRecvView("NORTH"), nxl, 1, nzl, dof);
+    Vec4D Frecv(udata->grid->GetRecvView("FRONT"), nxl, nyl, 1, dof);
+
+    /*   Compute each "upper" face; edge cells are recomputed with the same stencil by several kernels */
+    Kokkos::parallel_for("AdvectionBoundaryEast",
+                         Range3D({0,0,0},{nyl,nzl,dof}),
+                         KOKKOS_LAMBDA (int j, int k, int l)
+    {
+      const int i = nxl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,0,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,0,l);
+      dYview(i,j,k,l)  = cx * (Erecv(0,j,k,l) - Yijkl); // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl);         // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl);         // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundaryNorth",
+                         Range3D({0,0,0},{nxl,nzl,dof}),
+                         KOKKOS_LAMBDA (int i, int k, int l)
+    {
+      const int j = nyl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast  = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(0,j,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,0,l);
+      dYview(i,j,k,l)  = cx * (YEast - Yijkl);          // d/dx
+      dYview(i,j,k,l) += cy * (Nrecv(i,0,k,l) - Yijkl); // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl);         // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundaryFront",
+                         Range3D({0,0,0},{nxl,nyl,dof}),
+                         KOKKOS_LAMBDA (int i, int j, int l)
+    {
+      const int k = nzl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast  = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(0,j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,0,k,l);
+      dYview(i,j,k,l)  = cx * (YEast - Yijkl);          // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl);         // d/dy
+      dYview(i,j,k,l) += cz * (Frecv(i,j,0,l) - Yijkl); // d/dz
+    });
+  }
+
+  /* return success */
+  return(0);
+}
+
+
+/* Reaction term g(t,y) for the three components u, v, w at each grid point. */
+static int Reaction(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* local grid extents and degrees of freedom per grid point */
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const int dof = udata->grid->dof;
+
+  /* reaction parameters, copied to locals for use inside the kernel */
+  const realtype A  = udata->A;
+  const realtype B  = udata->B;
+  const realtype k1 = udata->k1;
+  const realtype k2 = udata->k2;
+  const realtype k3 = udata->k3;
+  const realtype k4 = udata->k4;
+  const realtype k5 = udata->k5;
+  const realtype k6 = udata->k6;
+
+  /* start from zero unless the reactions are added onto an existing RHS */
+  if (!udata->add_reactions) { N_VConst(0.0, ydot); }
+
+  /* wrap the local state and RHS data in 4D views */
+  Vec4D state(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D rhs(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(ydot)), nxl, nyl, nzl, dof);
+
+  /* accumulate the reaction contribution at every local grid point */
+  Kokkos::parallel_for("ReactionRHS",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+    const realtype u = state(i,j,k,0);
+    const realtype v = state(i,j,k,1);
+    const realtype w = state(i,j,k,2);
+    rhs(i,j,k,0) += k1 * A - k2 * w * u + k3 * u * u * v - k4 * u;
+    rhs(i,j,k,1) += k2 * w * u - k3 * u * u * v;
+    rhs(i,j,k,2) += -k2 * w * u + k5 * B - k6 * w;
+  });
+
+  /* nothing above reports an error */
+  return(0);
+}
+
+
+/* Full right-hand side h(t,y) = f(t,y) + g(t,y) (advection plus reaction). */
+static int AdvectionReaction(realtype t, N_Vector y, N_Vector ydot,
+                             void *user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+
+  /* Advection has to run first: it overwrites ydot, whereas Reaction
+     adds onto what is already stored there (when udata->add_reactions
+     is set; otherwise Reaction zeroes ydot before writing). */
+  int flag = Advection(t, y, ydot, user_data);
+  if (check_retval((void *)&flag, "Advection", 1, udata->myid))
+    return(-1);
+
+  flag = Reaction(t, y, ydot, user_data);
+  if (check_retval((void *)&flag, "Reaction", 1, udata->myid))
+    return(-1);
+
+  return(0);
+}
+
+/* DAE residual F(t,y,y') = ydot - h(t,y), with h = advection + reaction. */
+static int AdvectionReactionResidual(realtype t, N_Vector y, N_Vector ydot,
+                                     N_Vector F, void *user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+
+  /* Build h(t,y) in F.  Advection goes first because it overwrites F,
+     while Reaction accumulates onto the values already stored. */
+  int flag = Advection(t, y, F, user_data);   /* F = -c y_x */
+  if (check_retval((void *)&flag, "Advection", 1, udata->myid))
+    return(-1);
+
+  flag = Reaction(t, y, F, user_data);        /* F = -c y_x + g(t,y) */
+  if (check_retval((void *)&flag, "Reaction", 1, udata->myid))
+    return(-1);
+
+  /* form the residual in place: F = ydot - h(t,y) = ydot + c y_x - g(t,y) */
+  N_VLinearSum(1.0, ydot, -1.0, F, F);
+
+  /* success */
+  return(0);
+}
+
+/* --------------------------------------------------------------
+ * Linear system and Jacobian functions
+ * --------------------------------------------------------------*/
+
+/* Solve the linear systems Ax = b where A = I - gamma*dg/dy.
+   When using a fully implicit method, we are approximating
+   dh/dy as dg/dy.  x and b may be the same vector (TaskLocalLSolve passes
+   delta for both), so each block of b is read in full before the
+   corresponding block of x is written.  Always returns 0. */
+static int SolveReactionLinSys(N_Vector y, N_Vector x, N_Vector b,
+                               const realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2  = udata->k2;
+  const realtype k3  = udata->k3;
+  const realtype k4  = udata->k4;
+  const realtype k6  = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D Bview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(b)), nxl, nyl, nzl, dof);
+  Vec4D Xview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(x)), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system */
+  Kokkos::parallel_for("SolveReactionLinSys",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    /* copy the RHS block first: Xview may alias Bview and is written below */
+    const realtype b0 = Bview(i,j,k,0);
+    const realtype b1 = Bview(i,j,k,1);
+    const realtype b2 = Bview(i,j,k,2);
+
+    // compute A = I - gamma*(dg/dy)
+    /* 1st row: u, v, w */
+    const realtype A0 = 1. - gamma * (-k2 * w + 2.0 * k3 * u * v - k4);
+    const realtype A1 = -gamma * (k3 * u * u);
+    const realtype A2 = -gamma * (-k2 * u);
+
+    /* 2nd row: u, v, w */
+    const realtype A3 = -gamma * (k2 * w - 2.0 * k3 * u * v);
+    const realtype A4 = 1. - gamma * (-k3 * u * u);
+    const realtype A5 = -gamma * (k2 * u);
+
+    /* 3rd row: u, v, w */
+    const realtype A6 = -gamma * (-k2 * w);
+    const realtype A7 =  0.0;
+    const realtype A8 = 1. - gamma * (-k2 * u - k6);
+
+    // compute x = A^{-1}*b
+
+    const realtype scratch_0 = A4*A8;
+    const realtype scratch_1 = A1*A5;
+    const realtype scratch_2 = A2*A7;
+    const realtype scratch_3 = A5*A7;
+    const realtype scratch_4 = A1*A8;
+    const realtype scratch_5 = A2*A4;
+    const realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
+    const realtype scratch_7 = A2*A3;
+    const realtype scratch_8 = A6*b0;
+    const realtype scratch_9 = A2*A6;
+    const realtype scratch_10 = A3*b0;
+    const realtype scratch_11 = 1.0/A0;
+    const realtype scratch_12 = A1*scratch_11;
+    const realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
+
+    Xview(i,j,k,0) = scratch_6*( b0*(scratch_0 - scratch_3)
+                               + b1*(scratch_2 - scratch_4)
+                               + b2*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( b2*(scratch_7 - A0*A5)
+                               + b1*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    Xview(i,j,k,2) = ( -b2 + scratch_11*scratch_8
+                     + scratch_13*(b1 - scratch_10*scratch_11)) /
+                     (-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
+  });
+
+  return(0);
+}
+
+/* Solve the linear systems Ax = b where A = -dg/dy + gamma*I.
+   We are approximating dh/dy as dg/dy.  x and b may be the same vector:
+   each block of b is read in full before the corresponding block of x
+   is written.  The kernel is labeled "SolveReactionLinSysRes" so that it
+   can be told apart from SolveReactionLinSys.  Always returns 0. */
+static int SolveReactionLinSysRes(N_Vector y, N_Vector x, N_Vector b,
+                                  const realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2  = udata->k2;
+  const realtype k3  = udata->k3;
+  const realtype k4  = udata->k4;
+  const realtype k6  = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D Bview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(b)), nxl, nyl, nzl, dof);
+  Vec4D Xview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(x)), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system */
+  Kokkos::parallel_for("SolveReactionLinSysRes",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    /* copy the RHS block first: Xview may alias Bview and is written below */
+    const realtype b0 = Bview(i,j,k,0);
+    const realtype b1 = Bview(i,j,k,1);
+    const realtype b2 = Bview(i,j,k,2);
+
+    // compute A = -dg/dy + gamma*I; gamma multiplies dF/dydot, which is
+    // the identity for the residual F = ydot - h(t,y)
+    /* 1st row: u, v, w */
+    const realtype A0 = -(-k2 * w + 2.0 * k3 * u * v - k4) + gamma;
+    const realtype A1 = -(k3 * u * u);
+    const realtype A2 = -(-k2 * u);
+
+    /* 2nd row: u, v, w */
+    const realtype A3 = -(k2 * w - 2.0 * k3 * u * v);
+    const realtype A4 = -(-k3 * u * u) + gamma;
+    const realtype A5 = -(k2 * u);
+
+    /* 3rd row: u, v, w */
+    const realtype A6 = -(-k2 * w);
+    const realtype A7 =  0.0;
+    const realtype A8 = -(-k2 * u - k6) + gamma;
+
+    // compute x = A^{-1}*b
+
+    const realtype scratch_0 = A4*A8;
+    const realtype scratch_1 = A1*A5;
+    const realtype scratch_2 = A2*A7;
+    const realtype scratch_3 = A5*A7;
+    const realtype scratch_4 = A1*A8;
+    const realtype scratch_5 = A2*A4;
+    const realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
+    const realtype scratch_7 = A2*A3;
+    const realtype scratch_8 = A6*b0;
+    const realtype scratch_9 = A2*A6;
+    const realtype scratch_10 = A3*b0;
+    const realtype scratch_11 = 1.0/A0;
+    const realtype scratch_12 = A1*scratch_11;
+    const realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
+
+    Xview(i,j,k,0) = scratch_6*( b0*(scratch_0 - scratch_3)
+                               + b1*(scratch_2 - scratch_4)
+                               + b2*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( b2*(scratch_7 - A0*A5)
+                               + b1*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    Xview(i,j,k,2) = ( -b2 + scratch_11*scratch_8
+                     + scratch_13*(b1 - scratch_10*scratch_11)) /
+                     (-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
+  });
+
+  return(0);
+}
+
+
+/* --------------------------------------------------------------
+ * Preconditioner functions
+ * --------------------------------------------------------------*/
+
+/* Preconditioner solve: find z such that P z = r, with P = I - gamma * dg/dy.
+   The arguments t, ydot, delta and lr are not used. */
+static int PSolve(realtype t, N_Vector y, N_Vector ydot, N_Vector r,
+                  N_Vector z, realtype gamma, realtype delta, int lr,
+                  void *user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* the system decouples into one 3x3 block per grid point, so hand off
+     to the pointwise reaction solver with right-hand side r and result z */
+  const int status = SolveReactionLinSys(y, z, r, gamma, udata);
+
+  return(status);
+}
+
+/* Residual-form preconditioner solve: find z such that P z = r, where
+   P = -dg/dy + cj*I.  The arguments t, ydot, F and delta are not used. */
+static int PSolveRes(realtype t, N_Vector y, N_Vector ydot, N_Vector F,
+                     N_Vector r, N_Vector z, realtype cj, realtype delta,
+                     void *user_data)
+{
+  UserData* udata = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* cj takes the place of gamma in the pointwise residual solver, which
+     is called with right-hand side r and result z */
+  const int status = SolveReactionLinSysRes(y, z, r, cj, udata);
+
+  return(status);
+}
+
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/raja/CMakeLists.txt b/benchmarks/advection_reaction_3D/raja/CMakeLists.txt
new file mode 100644
index 0000000000..0bae78c562
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/CMakeLists.txt
@@ -0,0 +1,151 @@
+# ---------------------------------------------------------------
+# Programmer(s):  Cody J. Balos @ LLNL
+#                 Daniel R. Reynolds @ SMU
+# ---------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2023, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ---------------------------------------------------------------
+
+if(BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA)
+  # Every target below links ARKODE, CVODE and IDA. Link OpenMP too when a RAJA OpenMP backend is enabled.
+  if((RAJA_BACKENDS MATCHES "TARGET_OPENMP") OR (RAJA_BACKENDS MATCHES "OPENMP"))
+    set(OTHER_LIBS OpenMP::OpenMP_CXX)
+  endif()
+
+  # ----------------------------------------------------------------------------
+  # MPI only (links the serial NVECTOR)
+  # ----------------------------------------------------------------------------
+
+  add_executable(advection_reaction_3D_raja
+    advection_reaction_3D.cpp
+    arkode_driver.cpp
+    cvode_driver.cpp
+    ida_driver.cpp
+    rhs3D.hpp
+    ParallelGrid.hpp
+    check_retval.h
+    backends.hpp)
+
+  # ensure the linker language is reset to CXX
+  set_target_properties(advection_reaction_3D_raja PROPERTIES LINKER_LANGUAGE CXX)
+
+  target_include_directories(advection_reaction_3D_raja
+    PRIVATE
+    ${PROJECT_SOURCE_DIR}/utilities
+    ${MPI_CXX_INCLUDE_DIRS})
+
+  target_link_libraries(advection_reaction_3D_raja
+    PRIVATE
+    sundials_arkode
+    sundials_cvode
+    sundials_ida
+    sundials_nvecmpiplusx
+    sundials_nvecserial
+    RAJA
+    ${MPI_CXX_LIBRARIES}
+    ${OTHER_LIBS})
+
+  install(TARGETS advection_reaction_3D_raja
+    DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja")
+  # install the README and the analysis scripts from ../scripts next to the executables
+  install(FILES README.md ../scripts/compare_error.py ../scripts/compute_error.py ../scripts/pickle_solution_output.py
+    DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja")
+
+  # ----------------------------------------------------------------------------
+  # MPI + CUDA
+  # ----------------------------------------------------------------------------
+
+  if(BUILD_NVECTOR_CUDA)
+    # Compile the .cpp sources as CUDA. NOTE(review): source-file properties are directory-scoped, so this
+    # presumably also applies to the MPI-only target above, which lists the same files -- confirm intended.
+    set_source_files_properties(advection_reaction_3D.cpp
+      PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(arkode_driver.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(cvode_driver.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(ida_driver.cpp PROPERTIES LANGUAGE CUDA)
+    add_executable(advection_reaction_3D_raja_mpicuda
+      advection_reaction_3D.cpp
+      arkode_driver.cpp
+      cvode_driver.cpp
+      ida_driver.cpp
+      rhs3D.hpp
+      ParallelGrid.hpp
+      check_retval.h
+      backends.hpp)
+
+    # ensure the linker language is reset to CXX
+    set_target_properties(advection_reaction_3D_raja_mpicuda
+      PROPERTIES LINKER_LANGUAGE CXX)
+
+    target_include_directories(advection_reaction_3D_raja_mpicuda
+      PRIVATE
+      ${PROJECT_SOURCE_DIR}/utilities
+      ${MPI_CXX_INCLUDE_DIRS})
+
+    target_link_libraries(advection_reaction_3D_raja_mpicuda
+      PRIVATE
+      sundials_arkode
+      sundials_cvode
+      sundials_ida
+      sundials_nvecmpiplusx
+      sundials_nveccuda
+      RAJA
+      ${MPI_CXX_LIBRARIES}
+      ${OTHER_LIBS})
+
+    target_compile_definitions(advection_reaction_3D_raja_mpicuda PRIVATE USE_CUDA_NVEC)
+
+    install(TARGETS advection_reaction_3D_raja_mpicuda
+      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja")
+
+  endif()
+
+  # ----------------------------------------------------------------------------
+  # MPI + HIP
+  # ----------------------------------------------------------------------------
+
+  if(BUILD_NVECTOR_HIP)
+    # NOTE(review): unlike the CUDA build, no per-source LANGUAGE is set here; device support comes via hip::device.
+    add_executable(advection_reaction_3D_raja_mpihip
+      advection_reaction_3D.cpp
+      advection_reaction_3D.hpp
+      arkode_driver.cpp
+      cvode_driver.cpp
+      ida_driver.cpp
+      rhs3D.hpp
+      ParallelGrid.hpp
+      check_retval.h
+      backends.hpp)
+
+    target_include_directories(advection_reaction_3D_raja_mpihip
+      PRIVATE
+      ${PROJECT_SOURCE_DIR}/utilities
+      ${MPI_CXX_INCLUDE_DIRS})
+
+    target_link_libraries(advection_reaction_3D_raja_mpihip
+      PRIVATE
+      sundials_arkode
+      sundials_cvode
+      sundials_ida
+      sundials_nvecmpiplusx
+      sundials_nvechip
+      RAJA
+      hip::device
+      ${MPI_CXX_LIBRARIES}
+      ${OTHER_LIBS})
+
+    target_compile_definitions(advection_reaction_3D_raja_mpihip PRIVATE USE_HIP_NVEC)
+
+    install(TARGETS advection_reaction_3D_raja_mpihip
+      DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja")
+
+  endif()
+
+endif()
diff --git a/benchmarks/advection_reaction_3D/ParallelGrid.hpp b/benchmarks/advection_reaction_3D/raja/ParallelGrid.hpp
similarity index 56%
rename from benchmarks/advection_reaction_3D/ParallelGrid.hpp
rename to benchmarks/advection_reaction_3D/raja/ParallelGrid.hpp
index abd6185810..1592a27806 100644
--- a/benchmarks/advection_reaction_3D/ParallelGrid.hpp
+++ b/benchmarks/advection_reaction_3D/raja/ParallelGrid.hpp
@@ -1,5 +1,6 @@
 /* -----------------------------------------------------------------------------
  * Programmer(s): Cody J. Balos @ LLNL
+ *                Daniel R. Reynolds @ SMU
  * -----------------------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2023, Lawrence Livermore National Security
@@ -40,24 +41,26 @@ enum class StencilType
   UPWIND
 };
 
-template<typename REAL, typename GLOBALINT, int NDIMS>
+template<typename REAL, typename GLOBALINT>
 class ParallelGrid
 {
 public:
   // Constructor that creates a new ParallelGrid object.
   // [in] - the memory helper to use for allocating the MPI buffers
   // [in,out] comm - on input, the overal MPI communicator, on output, the cartesian communicator
-  // [in] a[] - an array of length NDIMS which defines the domain [a,b]
-  // [in] b[] - an array of length NDIMS which defines the domain [a,b]
-  // [in] npts[] - an array of length NDIMS which defines the number of mesh points in each dimension
+  // [in] a[] - an array of length 3 which defines the domain [a,b]
+  // [in] b[] - an array of length 3 which defines the domain [a,b]
+  // [in] npts[] - an array of length 3 which defines the number of mesh points in each dimension
   // [in] dof - the number of degrees of freedom in each dimension
   // [in] bc - the type of boundary conditions (see BoundaryType)
   // [in] st - the stencil to use (see StencilType)
   // [in] width - the stencil width; defaults to 1
   // [in] npxyz - the number of processors in each dimension; defaults to 0 which means MPI will choose
   // [in] reorder - should MPI_Cart_create do process reordering to optimize or not; defaults to false (some MPI implementations ignore this)
-  ParallelGrid(SUNMemoryHelper memhelp, MPI_Comm* comm, const REAL a[], const REAL b[], const GLOBALINT npts[], int dof,
-               BoundaryType bc, StencilType st, int width = 1, const int npxyz[] = nullptr, bool reorder = false)
+  ParallelGrid(SUNMemoryHelper memhelp, MPI_Comm* comm, const REAL a[], const REAL b[],
+               const GLOBALINT npts[], int dof, BoundaryType bc, StencilType st,
+               const REAL c, int width = 1, const int npxyz[] = nullptr,
+               bool reorder = false)
     : nx(1), ny(1), nz(1),
       nxl(1), nyl(1), nzl(1),
       npx(1), npy(1), npz(1),
@@ -66,35 +69,40 @@ class ParallelGrid
       bx(0.0), by(0.0), bz(0.0),
       dof(dof), dims{0,0,0}, coords{0,0,0},
       bc(bc), st(st), width(width),
+      upwindRight(true),
       memhelp(memhelp)
-  {
-    static_assert((NDIMS >= 1 && NDIMS <= 3), "ParallelGrid NDIMS must be 1, 2 or 3");
 
-    int retval, nprocs;
-    int periods[] = {0, 0, 0};
+  {
+    assert(st == StencilType::UPWIND);
 
+    /* Set up MPI Cartesian communicator */
     if (npxyz)
     {
       dims[0] = npxyz[0];
-      if (NDIMS >= 2) dims[1] = npxyz[1];
-      if (NDIMS == 3) dims[2] = npxyz[2];
+      dims[1] = npxyz[1];
+      dims[2] = npxyz[2];
     }
 
+    int retval, nprocs;
     MPI_Comm_size(*comm, &nprocs);
-    retval = MPI_Dims_create(nprocs, NDIMS, dims);
+    retval = MPI_Dims_create(nprocs, 3, dims);
     assert(retval == MPI_SUCCESS);
 
-    periods[0] = bc == BoundaryType::PERIODIC;
-    periods[1] = bc == BoundaryType::PERIODIC;
-    periods[2] = bc == BoundaryType::PERIODIC;
-    retval = MPI_Cart_create(*comm, NDIMS, dims, periods, reorder, comm);
+    int periods[] = { bc == BoundaryType::PERIODIC,
+                      bc == BoundaryType::PERIODIC,
+                      bc == BoundaryType::PERIODIC };
+    retval = MPI_Cart_create(*comm, 3, dims, periods, reorder, comm);
     assert(retval == MPI_SUCCESS);
 
-    retval = MPI_Cart_get(*comm, NDIMS, dims, periods, coords);
+    retval = MPI_Cart_get(*comm, 3, dims, periods, coords);
     assert(retval == MPI_SUCCESS);
 
     cart_comm = *comm;
 
+    /* Set upwinding direction */
+    upwindRight = (c > 0.0);
+
+    /* Set up information for the first spatial dimension */
     npx = dims[0];
     nx  = npts[0];
     ax  = a[0];
@@ -103,251 +111,235 @@ class ParallelGrid
     int is = nx*(coords[0])/npx;
     int ie = nx*(coords[0]+1)/npx-1;
     nxl = ie-is+1;
-
     neq = dof * nxl;
 
-    if (NDIMS >= 2)
-    {
-      npy = dims[1];
-      ny  = npts[1];
-      ay  = a[1];
-      by  = b[1];
-      dy  = (by-ay) / (REAL) ny;
-      int js = ny*(coords[1])/npy;
-      int je = ny*(coords[1]+1)/npy-1;
-      nyl = je-js+1;
-
-      neq *= nyl;
-    }
-
-    if (NDIMS == 3)
-    {
-      npz = dims[2];
-      nz  = npts[2];
-      az  = a[2];
-      bz  = b[2];
-      dz  = (bz-az) / (REAL) nz;
-      int ks = nz*(coords[2])/npz;
-      int ke = nz*(coords[2]+1)/npz-1;
-      nzl = ke-ks+1;
-
-      neq *= nzl;
-    }
-
+    /* Set up information for the second spatial dimension */
+    npy = dims[1];
+    ny  = npts[1];
+    ay  = a[1];
+    by  = b[1];
+    dy  = (by-ay) / (REAL) ny;
+    int js = ny*(coords[1])/npy;
+    int je = ny*(coords[1]+1)/npy-1;
+    nyl = je-js+1;
+    neq *= nyl;
+
+    /* Set up information for the third spatial dimension */
+    npz = dims[2];
+    nz  = npts[2];
+    az  = a[2];
+    bz  = b[2];
+    dz  = (bz-az) / (REAL) nz;
+    int ks = nz*(coords[2])/npz;
+    int ke = nz*(coords[2]+1)/npz-1;
+    nzl = ke-ks+1;
+    neq *= nzl;
+
+    /* Allocate buffers for nearest-neighbor exchange */
     if (st == StencilType::UPWIND)
       AllocateBuffersUpwind();
 
   }
 
   // TODO:
-  //  - does not take advantage of upwind scheme to reduce communications and memory
   //  - support non-periodic boundary conditions
   // For all faces where neighbors exist: determine neighbor process indices.
   // For all faces: allocate exchange buffers.
   void AllocateBuffersUpwind()
   {
-    int retval = 0;
-    int nbcoords[] = {0, 0, 0};
 
-    SUNMemoryHelper_Alloc(memhelp, &Wrecv_, sizeof(REAL)*dof*width*nyl*nzl,
-                          memoryType(), nullptr);
-    SUNMemoryHelper_Alloc(memhelp, &Wsend_, sizeof(REAL)*dof*width*nyl*nzl,
-                          memoryType(), nullptr);
+    /* Allocate send/receive buffers and determine ID for communication West */
+    if (upwindRight)
+      SUNMemoryHelper_Alloc(memhelp, &Wrecv_, sizeof(REAL)*dof*width*nyl*nzl,
+                            memoryType(), nullptr);
+    else
+      SUNMemoryHelper_Alloc(memhelp, &Wsend_, sizeof(REAL)*dof*width*nyl*nzl,
+                            memoryType(), nullptr);
     ipW = MPI_PROC_NULL;
     if ((coords[0] > 0) || (bc == BoundaryType::PERIODIC)) {
-      nbcoords[0] = coords[0]-1;
-      nbcoords[1] = coords[1];
-      nbcoords[2] = coords[2];
-      retval = MPI_Cart_rank(cart_comm, nbcoords, &ipW);
+      int nbcoords[] = {coords[0]-1, coords[1], coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipW);
       assert(retval == MPI_SUCCESS);
     }
 
-    SUNMemoryHelper_Alloc(memhelp, &Erecv_, sizeof(REAL)*dof*width*nyl*nzl,
-                          memoryType(), nullptr);
-    SUNMemoryHelper_Alloc(memhelp, &Esend_, sizeof(REAL)*dof*width*nyl*nzl,
-                          memoryType(), nullptr);
+    /* Allocate send/receive buffers and determine ID for communication East */
+    if (upwindRight)
+      SUNMemoryHelper_Alloc(memhelp, &Esend_, sizeof(REAL)*dof*width*nyl*nzl,
+                            memoryType(), nullptr);
+    else
+      SUNMemoryHelper_Alloc(memhelp, &Erecv_, sizeof(REAL)*dof*width*nyl*nzl,
+                            memoryType(), nullptr);
     ipE = MPI_PROC_NULL;
     if ((coords[0] < dims[0]-1) || (bc == BoundaryType::PERIODIC)) {
-      nbcoords[0] = coords[0]+1;
-      nbcoords[1] = coords[1];
-      nbcoords[2] = coords[2];
-      retval = MPI_Cart_rank(cart_comm, nbcoords, &ipE);
+      int nbcoords[] = {coords[0]+1, coords[1], coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipE);
       assert(retval == MPI_SUCCESS);
     }
 
-    if (NDIMS >= 2)
-    {
+    /* Allocate send/receive buffers and determine ID for communication South */
+    if (upwindRight)
       SUNMemoryHelper_Alloc(memhelp, &Srecv_, sizeof(REAL)*dof*width*nxl*nzl,
                             memoryType(), nullptr);
+    else
       SUNMemoryHelper_Alloc(memhelp, &Ssend_, sizeof(REAL)*dof*width*nxl*nzl,
                             memoryType(), nullptr);
-      ipS = MPI_PROC_NULL;
-      if ((coords[1] > 0) || (bc == BoundaryType::PERIODIC)) {
-        nbcoords[0] = coords[0];
-        nbcoords[1] = coords[1]-1;
-        nbcoords[2] = coords[2];
-        retval = MPI_Cart_rank(cart_comm, nbcoords, &ipS);
-        assert(retval == MPI_SUCCESS);
-      }
+    ipS = MPI_PROC_NULL;
+    if ((coords[1] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1]-1, coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipS);
+      assert(retval == MPI_SUCCESS);
+    }
 
-      SUNMemoryHelper_Alloc(memhelp, &Nrecv_, sizeof(REAL)*dof*width*nxl*nzl,
-                            memoryType(), nullptr);
+    /* Allocate send/receive buffers and determine ID for communication North */
+    if (upwindRight)
       SUNMemoryHelper_Alloc(memhelp, &Nsend_, sizeof(REAL)*dof*width*nxl*nzl,
                             memoryType(), nullptr);
-      ipN = MPI_PROC_NULL;
-      if ((coords[1] < dims[1]-1) || (bc == BoundaryType::PERIODIC)) {
-        nbcoords[0] = coords[0];
-        nbcoords[1] = coords[1]+1;
-        nbcoords[2] = coords[2];
-        retval = MPI_Cart_rank(cart_comm, nbcoords, &ipN);
-        assert(retval == MPI_SUCCESS);
-      }
+    else
+      SUNMemoryHelper_Alloc(memhelp, &Nrecv_, sizeof(REAL)*dof*width*nxl*nzl,
+                            memoryType(), nullptr);
+    ipN = MPI_PROC_NULL;
+    if ((coords[1] < dims[1]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1]+1, coords[2]};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipN);
+      assert(retval == MPI_SUCCESS);
     }
 
-    if (NDIMS == 3)
-    {
+    /* Allocate send/receive buffers and determine ID for communication Back */
+    if (upwindRight)
       SUNMemoryHelper_Alloc(memhelp, &Brecv_, sizeof(REAL)*dof*width*nxl*nyl,
                             memoryType(), nullptr);
+    else
       SUNMemoryHelper_Alloc(memhelp, &Bsend_, sizeof(REAL)*dof*width*nxl*nyl,
                             memoryType(), nullptr);
-      ipB = MPI_PROC_NULL;
-      if ((coords[2] > 0) || (bc == BoundaryType::PERIODIC)) {
-        nbcoords[0] = coords[0];
-        nbcoords[1] = coords[1];
-        nbcoords[2] = coords[2]-1;
-        retval = MPI_Cart_rank(cart_comm, nbcoords, &ipB);
-        assert(retval == MPI_SUCCESS);
-      }
+    ipB = MPI_PROC_NULL;
+    if ((coords[2] > 0) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1], coords[2]-1};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipB);
+      assert(retval == MPI_SUCCESS);
+    }
 
-      SUNMemoryHelper_Alloc(memhelp, &Frecv_, sizeof(REAL)*dof*width*nxl*nyl,
-                            memoryType(), nullptr);
+    /* Allocate send/receive buffers and determine ID for communication Front */
+    if (upwindRight)
       SUNMemoryHelper_Alloc(memhelp, &Fsend_, sizeof(REAL)*dof*width*nxl*nyl,
                             memoryType(), nullptr);
-      ipF = MPI_PROC_NULL;
-      if ((coords[2] < dims[2]-1) || (bc == BoundaryType::PERIODIC)) {
-        nbcoords[0] = coords[0];
-        nbcoords[1] = coords[1];
-        nbcoords[2] = coords[2]+1;
-        retval = MPI_Cart_rank(cart_comm, nbcoords, &ipF);
-        assert(retval == MPI_SUCCESS);
-      }
+    else
+      SUNMemoryHelper_Alloc(memhelp, &Frecv_, sizeof(REAL)*dof*width*nxl*nyl,
+                            memoryType(), nullptr);
+    ipF = MPI_PROC_NULL;
+    if ((coords[2] < dims[2]-1) || (bc == BoundaryType::PERIODIC)) {
+      int nbcoords[] = {coords[0], coords[1], coords[2]+1};
+      int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipF);
+      assert(retval == MPI_SUCCESS);
     }
 
   }
 
-  // TODO: this could be optimized for upwind
-  int ExchangeStart(std::function<void (REAL*,REAL*,REAL*,REAL*,REAL*,REAL*)> fill)
+  // Initiate non-blocking neighbor communication
+  int ExchangeStart()
   {
     int retval = 0;
+    nreq = 0;
 
     // Initialize all requests in array
     for (int i=0; i<12; i++)
       req[i] = MPI_REQUEST_NULL;
 
     // Open an Irecv buffer for each neighbor
-    if (ipW != MPI_PROC_NULL)
+    if ((ipW != MPI_PROC_NULL) && (upwindRight))
     {
-      retval = MPI_Irecv(getRecvBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW,
-                         1, cart_comm, req);
+      retval = MPI_Irecv(getRecvBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW,
+                         1, cart_comm, req+nreq);
       assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
-    if (ipE != MPI_PROC_NULL)
+    if ((ipE != MPI_PROC_NULL) && (!upwindRight))
     {
-      retval = MPI_Irecv(getRecvBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE,
-                         0, cart_comm, req+1);
+      retval = MPI_Irecv(getRecvBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE,
+                         0, cart_comm, req+nreq);
       assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
-    if (NDIMS >= 2)
+    if ((ipS != MPI_PROC_NULL) && (upwindRight))
     {
-      if (ipS != MPI_PROC_NULL)
-      {
-        retval = MPI_Irecv(getRecvBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS,
-                           3, cart_comm, req+2);
-        assert(retval == MPI_SUCCESS);
-      }
-
-      if (ipN != MPI_PROC_NULL)
-      {
-        retval = MPI_Irecv(getRecvBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN,
-                           2, cart_comm, req+3);
-        assert(retval == MPI_SUCCESS);
-      }
-    }
-
-    if (NDIMS >= 3)
+      retval = MPI_Irecv(getRecvBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS,
+                         3, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipN != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Irecv(getRecvBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN,
+                         2, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipB != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Irecv(getRecvBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB,
+                         5, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipF != MPI_PROC_NULL) && (!upwindRight))
     {
-      if (ipB != MPI_PROC_NULL)
-      {
-        retval = MPI_Irecv(getRecvBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB,
-                           5, cart_comm, req+4);
-        assert(retval == MPI_SUCCESS);
-      }
-
-      if (ipF != MPI_PROC_NULL)
-      {
-        retval = MPI_Irecv(getRecvBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF,
-                           4, cart_comm, req+5);
-        assert(retval == MPI_SUCCESS);
-      }
-    }
-
-    // Call user lambda to fill the send buffers
-    fill(getSendBuffer("WEST"),
-         getSendBuffer("EAST"),
-         getSendBuffer("SOUTH"),
-         getSendBuffer("NORTH"),
-         getSendBuffer("BACK"),
-         getSendBuffer("FRONT"));
+      retval = MPI_Irecv(getRecvBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF,
+                         4, cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
 
     // Send data to neighbors
-    if (ipW != MPI_PROC_NULL)
+    if ((ipW != MPI_PROC_NULL) && (!upwindRight))
     {
-      retval = MPI_Isend(getSendBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, 0,
-                         cart_comm, req+6);
+      retval = MPI_Isend(getSendBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, 0,
+                         cart_comm, req+nreq);
       assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
-    if (ipE != MPI_PROC_NULL)
+    if ((ipE != MPI_PROC_NULL) && (upwindRight))
     {
-      retval = MPI_Isend(getSendBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, 1,
-                         cart_comm, req+7);
+      retval = MPI_Isend(getSendBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, 1,
+                         cart_comm, req+nreq);
       assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
-    if (NDIMS >= 2)
+    if ((ipS != MPI_PROC_NULL) && (!upwindRight))
     {
-      if (ipS != MPI_PROC_NULL)
-      {
-        retval = MPI_Isend(getSendBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, 2,
-                           cart_comm, req+8);
-        assert(retval == MPI_SUCCESS);
-      }
-
-      if (ipN != MPI_PROC_NULL)
-      {
-        retval = MPI_Isend(getSendBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, 3,
-                           cart_comm, req+9);
-        assert(retval == MPI_SUCCESS);
-      }
-    }
-
-    if (NDIMS == 3)
+      retval = MPI_Isend(getSendBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, 2,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipN != MPI_PROC_NULL) && (upwindRight))
     {
-      if (ipB != MPI_PROC_NULL)
-      {
-        retval = MPI_Isend(getSendBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, 4,
-                           cart_comm, req+10);
-        assert(retval == MPI_SUCCESS);
-      }
+      retval = MPI_Isend(getSendBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, 3,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
 
-      if (ipF != MPI_PROC_NULL)
-      {
-        retval = MPI_Isend(getSendBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, 5,
-                           cart_comm, req+11);
-        assert(retval == MPI_SUCCESS);
-      }
+    if ((ipB != MPI_PROC_NULL) && (!upwindRight))
+    {
+      retval = MPI_Isend(getSendBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, 4,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
+    }
+
+    if ((ipF != MPI_PROC_NULL) && (upwindRight))
+    {
+      retval = MPI_Isend(getSendBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, 5,
+                         cart_comm, req+nreq);
+      assert(retval == MPI_SUCCESS);
+      nreq++;
     }
 
     return retval;
@@ -359,8 +351,12 @@ class ParallelGrid
     MPI_Status stat[12];
     int retval;
 
+    // return automatically with success if there are no outstanding requests
+    if (nreq == 0)
+      return(0);
+
     // Wait for messages to finish send/receive
-    retval = MPI_Waitall(12, req, stat);
+    retval = MPI_Waitall(nreq, req, stat);
     assert(retval == MPI_SUCCESS);
 
     return retval;
@@ -370,12 +366,16 @@ class ParallelGrid
   void PrintInfo()
   {
     printf("ParallelGrid Info:\n");
-    printf("    dimensions = %d\n", NDIMS);
+    printf("    dimensions = %d\n", 3);
     printf("    processors = {%d, %d, %d}\n", npx, npy, npz);
     printf("        domain = {[%g,%g], [%g,%g], [%g,%g]}\n", ax, bx, ay, by, az, bz);
     printf("   global npts = {%li, %li, %li}\n", (long int) nx, (long int) ny, (long int) nz);
     printf("    local npts = {%d, %d, %d}\n", nxl, nyl, nzl);
     printf("  mesh spacing = {%g, %g, %g}\n", dx, dy, dz);
+    if (upwindRight)
+      printf("    upwind dir = right\n");
+    else
+      printf("    upwind dir = left\n");
   }
 
   // Saves the mesh to a file.
@@ -407,16 +407,12 @@ class ParallelGrid
 
   GLOBALINT npts() const
   {
-    if (NDIMS == 1) return nx;
-    if (NDIMS == 2) return nx*ny;
-    if (NDIMS == 3) return nx*ny*nz;
+    return nx*ny*nz;
   }
 
   GLOBALINT nptsl() const
   {
-    if (NDIMS == 1) return nxl;
-    if (NDIMS == 2) return nxl*nyl;
-    if (NDIMS == 3) return nxl*nyl*nzl;
+    return nxl*nyl*nzl;
   }
 
   GLOBALINT neql() const
@@ -452,6 +448,7 @@ class ParallelGrid
     }
     else
     {
+      assert(direction == "ILLEGAL");
       return nullptr;
     }
   }
@@ -484,24 +481,28 @@ class ParallelGrid
     }
     else
     {
+      assert(direction == "ILLEGAL");
       return nullptr;
     }
   }
 
   ~ParallelGrid()
   {
-    SUNMemoryHelper_Dealloc(memhelp, Esend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Wsend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Nsend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Ssend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Fsend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Bsend_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Erecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Wrecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Nrecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Srecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Frecv_, nullptr);
-    SUNMemoryHelper_Dealloc(memhelp, Brecv_, nullptr);
+    if (upwindRight) {
+      SUNMemoryHelper_Dealloc(memhelp, Esend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Nsend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Fsend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Wrecv_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Srecv_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Brecv_, nullptr);
+    } else {
+      SUNMemoryHelper_Dealloc(memhelp, Wsend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Ssend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Bsend_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Erecv_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Nrecv_, nullptr);
+      SUNMemoryHelper_Dealloc(memhelp, Frecv_, nullptr);
+    }
   }
 
   GLOBALINT nx, ny, nz;    /* number of intervals globally       */
@@ -516,6 +517,7 @@ class ParallelGrid
   int       ipW, ipE;      /* MPI ranks for neighbor procs       */
   int       ipS, ipN;
   int       ipB, ipF;
+  bool      upwindRight;   /* Upwind dir: true/false == R/L      */
 
   int       dims[3];
   int       coords[3];
@@ -524,6 +526,7 @@ class ParallelGrid
 private:
   MPI_Comm     cart_comm;  /* MPI cartesian communicator         */
   MPI_Request  req[12];
+  int          nreq;
 
   BoundaryType bc;
   StencilType  st;
diff --git a/benchmarks/advection_reaction_3D/README.md b/benchmarks/advection_reaction_3D/raja/README.md
similarity index 78%
rename from benchmarks/advection_reaction_3D/README.md
rename to benchmarks/advection_reaction_3D/raja/README.md
index ab9974b660..33c82db725 100644
--- a/benchmarks/advection_reaction_3D/README.md
+++ b/benchmarks/advection_reaction_3D/raja/README.md
@@ -8,27 +8,31 @@ RAJA performance portability layer with serial, CUDA, or HIP backends.
 This code simulates the advection and reaction of three chemical species where
 the reaction mechanism is a variation of the Brusselator problem from chemical
 kinetics. The PDE system is given by
+```math
+\begin{align}
+  u_t &= -c \nabla u + A - (w+1) u + v u^2 \\
+  v_t &= -c \nabla v + w u - v u^2 \\
+  w_t &= -c \nabla w + (B - w) / \epsilon - w u
+\end{align}
 ```
-    u_t = -c grad(u) + A - (w+1) * u + v * u^2
-    v_t = -c grad(v) + w * u - v * u^2
-    w_t = -c grad(w) + (B - w) / epsilon - w * u
-```
-where `u`, `v`, and `w` are chemical concentrations, `c` is the advection speed,
-`A` and `B` are the concentrations of chemical species that remain constant over
-space and time, and `epsilon` is a parameter that varies the stiffness of the
-system. The problem is solved on the domain `(x,y,z) = X` in `[0, X_max]^3`,
-for times `t` in `[0,t_f]`. The initial condition is
-```
-    u(0,X) = A + p(X)
-    v(0,X) = B / A + p(X)
-    w(0,X) = 3.0 + p(X)
+where $u$, $v$, and $w$ are chemical concentrations, $c$ is the advection speed,
+$A$ and $B$ are the concentrations of chemical species that remain constant over
+space and time, and $\epsilon$ is a parameter that varies the stiffness of the
+system. The problem is solved on the domain $(x,y,z) = X$ in $[0, X_{\text{max}}]^3$,
+for times $t$ in $[0,t_f]$. The initial condition is
+```math
+\begin{align}
+    u(0,X) &= A + p(X) \\
+    v(0,X) &= B / A + p(X) \\
+    w(0,X) &= 3.0 + p(X)
+\end{align}
 ```
 where the perturbation function is
+```math
+    p(X) = \alpha \exp\left( -\frac{(X-\mu)^T \sigma^{-1} (X-\mu)}{2 \sqrt{|\sigma| 8 \pi^3}} \right)
 ```
-    p(X) = alpha * e^( -((X-mu)^T sigma^{-1} (X-mu)) / (2*sqrt(|sigma| 8 pi^3)) )
-```
-with `alpha = 0.1`, `mu = 0.5 X_max`, and `sigma` is a diagonal matrix with
-entries `0.25 X_max`.
+with $\alpha = 0.1$, $\mu = 0.5 X_{\text{max}}$, and $\sigma$ a diagonal
+matrix with entries $0.25 X_{\text{max}}$.
 
 Spatial derivatives are discretized with first-order upwind finite differences
 on a uniform spatial grid. The system can be evolved in time using explicit,
@@ -64,7 +68,7 @@ listed below.
 | `--method <method>`         | Integrator to use: `ERK`, `ARK-DIRK`, `ARK-IMEX`, `CV-BDF`, `CV-ADAMS`, `IDA` | `ARK-DIRK`  |
 | `--nls <method>`            | Nonlinear Solver Method: `newton`, `tl-newton`, `fixedpoint`, `none`          | `newton`    |
 | `--fpaccel <int>`           | Number of fixed point acceleration vectors                                    | 3           |
-| `--nopre`                   | Disable preconditioning                                                       | False       | 
+| `--nopre`                   | Disable preconditioning                                                       | False       |
 | `--fused`                   | Enabled fused operations                                                      | Off         |
 | `--tf <realtype>`           | Final integration time `t_f`                                                  | 10.0        |
 | `--rtol <realtype>`         | Relative tolerance                                                            | 1.0e-6      |
@@ -79,11 +83,11 @@ GPUs. See the installation guide for more details on configuring, building,
 and installing SUNDIALS.
 
 Based on the configuration the following executables will be built and installed
-in the `<install prefix>/bin/benchmarks/advection_reaction_3D` directory:
+in the `<benchmarks install prefix>/advection_reaction_3D/raja` directory:
 
-* `advection_reaction_3D` -- MPI parallelism
-* `advection_reaction_3D_mpicuda` -- MPI + CUDA parallelism
-* `advection_reaction_3D_mpihip` -- MPI + HIP parallelism
+* `advection_reaction_3D_raja` -- MPI parallelism
+* `advection_reaction_3D_raja_mpicuda` -- MPI + CUDA parallelism
+* `advection_reaction_3D_raja_mpihip` -- MPI + HIP parallelism
 
 On Summit, with the default environment
 ```
@@ -93,7 +97,7 @@ On Summit, with the default environment
 ```
 an example `jsrun` command is
 ```
-jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_mpicuda
+jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_raja_mpicuda
 ```
 
 On Lassen, with the environment
@@ -104,5 +108,5 @@ On Lassen, with the environment
 ```
 an example `jsrun` command is
 ```
-jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_mpicuda
+jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_raja_mpicuda
 ```
diff --git a/benchmarks/advection_reaction_3D/advection_reaction_3D.cpp b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.cpp
similarity index 71%
rename from benchmarks/advection_reaction_3D/advection_reaction_3D.cpp
rename to benchmarks/advection_reaction_3D/raja/advection_reaction_3D.cpp
index dc169c5fa1..088e4536a0 100644
--- a/benchmarks/advection_reaction_3D/advection_reaction_3D.cpp
+++ b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.cpp
@@ -1,5 +1,6 @@
 /* -----------------------------------------------------------------------------
  * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
+ *                Daniel R. Reynolds @ SMU
  * -----------------------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2023, Lawrence Livermore National Security
@@ -60,9 +61,13 @@
 
 #include "advection_reaction_3D.hpp"
 
+#define STENCIL_WIDTH 1
+
+
 /* Main Program */
 int main(int argc, char *argv[])
 {
+
   SUNContext ctx;
 
   /* Initialize MPI */
@@ -87,7 +92,6 @@ int main(int argc, char *argv[])
     UserData     udata(ctx);    /* user data                    */
     UserOptions  uopt;          /* user options                 */
     int          retval;        /* reusable error-checking flag */
-    char         fname[MXSTR];
 
     SUNDIALS_CXX_MARK_FUNCTION(udata.prof);
 
@@ -113,6 +117,7 @@ int main(int argc, char *argv[])
     /* Output spatial mesh to disk (add extra point for periodic BC) */
     if (udata.myid == 0 && uopt.nout > 0)
     {
+      char fname[MXSTR];
       snprintf(fname, MXSTR, "%s/mesh.txt", uopt.outputdir);
       udata.grid->MeshToFile(fname);
     }
@@ -124,7 +129,6 @@ int main(int argc, char *argv[])
     else if (uopt.method == "CV-BDF")   retval = EvolveProblemBDF(y, &udata, &uopt);
     else if (uopt.method == "CV-ADAMS") retval = EvolveProblemAdams(y, &udata, &uopt);
     else if (uopt.method == "IDA")      retval = EvolveDAEProblem(y, &udata, &uopt);
-
     if (check_retval(&retval, "Evolve", 1, udata.myid)) MPI_Abort(comm, 1);
 
     /* Clean up */
@@ -142,15 +146,6 @@ int main(int argc, char *argv[])
 /* Destructor for problem data */
 UserData::~UserData()
 {
-  /* free solution masks */
-  N_VDestroy(N_VGetLocalVector_MPIPlusX(umask));
-  N_VDestroy(umask);
-  N_VDestroy(vmask);
-  N_VDestroy(wmask);
-
-  /* free the parallel grid */
-  delete grid;
-
   /* close output streams */
   if (uopt->nout > 0)
   {
@@ -159,6 +154,24 @@ UserData::~UserData()
     if (WFID) fclose(WFID);
     if (TFID && myid == 0) fclose(TFID);
   }
+
+  /* free solution masks */
+  if (umask != nullptr) {
+    N_VDestroy(N_VGetLocalVector_MPIPlusX(umask));
+    N_VDestroy(umask);
+    umask = nullptr;
+  }
+  if (vmask != nullptr) {
+    N_VDestroy(vmask);
+    vmask = nullptr;
+  }
+  if (wmask != nullptr) {
+    N_VDestroy(wmask);
+    wmask = nullptr;
+  }
+
+  /* free the parallel grid */
+  delete grid;
 }
 
 
@@ -166,175 +179,98 @@ UserData::~UserData()
  * Communication functions
  * --------------------------------------------------------------*/
 
-/* Exchanges the boundary conditions only, */
-int ExchangeBCOnly(N_Vector y, UserData* udata)
+/* Fills send buffers before exchanging neighbor information */
+int FillSendBuffers(N_Vector y, UserData* udata)
 {
-  int ierr;
-  MPI_Status stat;
-  MPI_Request reqR, reqS;
 
   /* shortcuts */
-  int nvar  = udata->grid->dof;
-  int myid  = udata->myid;
-  int first = 0;
-  int last  = udata->nprocs - 1;
+  const realtype c = udata->c;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const int dof = udata->grid->dof;
 
-  /* extract the data */
-  realtype* Ydata = GetVecData(y);
-  realtype* Wsend = udata->grid->getSendBuffer("WEST");
+  /* Create a 4D view of the vector */
+  RAJA::View<realtype, RAJA::Layout<4> > Yview(GetVecData(y),
+                                               nxl, nyl, nzl, dof);
 
-  /* open the East Irecv buffer */
-  if (myid == last)
-  {
-    ierr = MPI_Irecv(udata->grid->getRecvBuffer("EAST"), nvar, MPI_SUNREALTYPE, first,
-                     MPI_ANY_TAG, udata->comm, &reqR);
-  }
-
-  /* send first mesh node to the last processor */
-  if (myid == first)
-  {
-    RAJA::forall< EXEC_POLICY >( RAJA::RangeSegment(0, nvar),
-      [=] DEVICE_FUNC (int var) {
-      Wsend[IDX(nvar, 0, var)] = Ydata[IDX(nvar, 0, var)];
-    });
-    ierr = MPI_Isend(Wsend, nvar, MPI_SUNREALTYPE,
-                     last, 0, udata->comm, &reqS);
-  }
-
-  if (myid == last)
-  {
-    /* wait for exchange to finish */
-    ierr = MPI_Wait(&reqR, &stat);
-    if (ierr != MPI_SUCCESS)
-    {
-      fprintf(stderr, "\nERROR: error in MPI_Wait = %d\n", ierr);
-      return -1;
-    }
-  }
-
-  if (myid == first)
+  if (c > 0.0)
   {
-    /* wait for exchange to finish */
-    ierr = MPI_Wait(&reqS, &stat);
-    if (ierr != MPI_SUCCESS)
-    {
-      fprintf(stderr, "\nERROR: error in MPI_Wait = %d\n", ierr);
-      return -1;
-    }
-  }
-
-  return(0);
-}
 
+    /* Flow moving in the positive directions uses backward difference. */
 
-/* Starts the exchange of the neighbor information */
-int ExchangeAllStart(N_Vector y, UserData* udata)
-{
-  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+    /* Fill 3D views of send buffers on device */
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Esend(udata->grid->getSendBuffer("EAST"),  nyl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Nsend(udata->grid->getSendBuffer("NORTH"), nxl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Fsend(udata->grid->getSendBuffer("FRONT"), nxl, nyl, dof);
+
+    auto east_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, nzl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(east_face,
+      [=] DEVICE_FUNC (int j, int k, int l) {
+        Esend(j,k,l) = Yview(nxl-1,j,k,l);
+    });
 
-  /* shortcuts */
-  realtype c = udata->c;
+    auto north_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nzl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(north_face,
+      [=] DEVICE_FUNC (int i, int k, int l) {
+        Nsend(i,k,l) = Yview(i,nyl-1,k,l);
+    });
 
-  /* extract the data */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     udata->grid->nxl,
-                                                     udata->grid->nyl,
-                                                     udata->grid->nzl,
-                                                     udata->grid->dof);
+    auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nyl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(front_face,
+      [=] DEVICE_FUNC (int i, int j, int l) {
+        Fsend(i,j,l) = Yview(i,j,nzl-1,l);
+    });
 
-  if (c > 0.0)
-  {
-    /* Flow moving in the positive directions uses backward difference. */
-    udata->grid->ExchangeStart(
-      [=] (realtype*, realtype* Esend, realtype*, realtype* Nsend, realtype* Bsend, realtype*) {
-        int nxl = udata->grid->nxl;
-        int nyl = udata->grid->nyl;
-        int nzl = udata->grid->nzl;
-        int dof = udata->grid->dof;
-
-        auto range = RAJA::make_tuple(RAJA::RangeSegment(0, std::max(1,nxl-1)),
-                                      RAJA::RangeSegment(0, std::max(1,nyl-1)),
-                                      RAJA::RangeSegment(0, std::max(1,nzl-1)));
-
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Eview(Esend, nyl, nzl, dof);
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Nview(Nsend, nxl, nzl, dof);
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Bview(Bsend, nxl, nyl, dof);
-
-        RAJA::kernel<XYZ_KERNEL_POL>(range,
-          [=] DEVICE_FUNC (int i, int j, int k) {
-
-          if (nxl > 1)
-          {
-            Eview(j,k,0) = Yview(nxl-1,j,k,0);
-            Eview(j,k,1) = Yview(nxl-1,j,k,1);
-            Eview(j,k,2) = Yview(nxl-1,j,k,2);
-          }
-
-          if (nyl > 1)
-          {
-            Nview(i,k,0) = Yview(i,nyl-1,k,0);
-            Nview(i,k,1) = Yview(i,nyl-1,k,1);
-            Nview(i,k,2) = Yview(i,nyl-1,k,2);
-          }
-
-          if (nzl > 1)
-          {
-            Bview(i,j,0) = Yview(i,j,nzl-1,0);
-            Bview(i,j,1) = Yview(i,j,nzl-1,1);
-            Bview(i,j,2) = Yview(i,j,nzl-1,2);
-          }
-
-        });
-      });
   }
   else if (c < 0.0)
   {
+
     /* Flow moving in the negative directions uses forward difference. */
 
-    udata->grid->ExchangeStart(
-      [=] (realtype* Wsend, realtype*, realtype*Ssend, realtype*, realtype*, realtype* Fsend) {
-        auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl-1),
-                                      RAJA::RangeSegment(0, udata->grid->nyl-1),
-                                      RAJA::RangeSegment(0, udata->grid->nzl-1));
-
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Wview(Wsend, udata->grid->nyl, udata->grid->nzl, udata->grid->dof);
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Sview(Ssend, udata->grid->nxl, udata->grid->nzl, udata->grid->dof);
-        RAJA::View<realtype, RAJA::Layout<3> >
-          Fview(Fsend, udata->grid->nxl, udata->grid->nyl, udata->grid->dof);
-
-        RAJA::kernel<XYZ_KERNEL_POL>(range,
-          [=] DEVICE_FUNC (int i, int j, int k) {
-          Wview(j,k,0) = Yview(0,j,k,0);
-          Wview(j,k,1) = Yview(0,j,k,1);
-          Wview(j,k,2) = Yview(0,j,k,2);
-
-          Sview(i,k,0) = Yview(i,0,k,0);
-          Sview(i,k,1) = Yview(i,0,k,1);
-          Sview(i,k,2) = Yview(i,0,k,2);
-
-          Fview(i,j,0) = Yview(i,j,0,0);
-          Fview(i,j,1) = Yview(i,j,0,1);
-          Fview(i,j,2) = Yview(i,j,0,2);
-        });
-      });
-  }
+    /* Fill 3D views of send buffers on device */
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Wsend(udata->grid->getSendBuffer("WEST"),  nyl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Ssend(udata->grid->getSendBuffer("SOUTH"), nxl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Bsend(udata->grid->getSendBuffer("BACK"),  nxl, nyl, dof);
+
+    auto west_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, nzl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(west_face,
+      [=] DEVICE_FUNC (int j, int k, int l) {
+        Wsend(j,k,l) = Yview(0,j,k,l);
+    });
 
-  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
-  return(0);
-}
+    auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nzl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(south_face,
+      [=] DEVICE_FUNC (int i, int k, int l) {
+        Ssend(i,k,l) = Yview(i,0,k,l);
+    });
 
+    auto back_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                      RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(back_face,
+      [=] DEVICE_FUNC (int i, int j, int l) {
+        Bsend(i,j,l) = Yview(i,j,0,l);
+    });
+
+  }
 
-/* Completes the exchange of the neighbor information */
-int ExchangeAllEnd(UserData* udata)
-{
-  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
-  udata->grid->ExchangeEnd();
-  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
   return(0);
 }
 
@@ -494,17 +430,20 @@ int ComponentMask(N_Vector mask, int component, const UserData* udata)
 
   N_VConst(0.0, mask);
 
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > mask_view(GetVecData(mask),
-                                                         udata->grid->nxl,
-                                                         udata->grid->nyl,
-                                                         udata->grid->nzl,
-                                                         udata->grid->dof);
+  /* Create 4D view of mask data */
+  RAJA::View<realtype, RAJA::Layout<4> > mask_view(GetVecData(mask),
+                                                   udata->grid->nxl,
+                                                   udata->grid->nyl,
+                                                   udata->grid->nzl,
+                                                   udata->grid->dof);
+  /* Fill mask data */
   auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
                                 RAJA::RangeSegment(0, udata->grid->nyl),
                                 RAJA::RangeSegment(0, udata->grid->nzl));
   RAJA::kernel<XYZ_KERNEL_POL>(range,
-    [=] DEVICE_FUNC (int xi, int yi, int zi) {
-    mask_view(xi,yi,zi,component) = 1.0;
+    [=] DEVICE_FUNC (int i, int j, int k)
+  {
+    mask_view(i,j,k,component) = 1.0;
   });
 
   return 0;
@@ -515,14 +454,9 @@ int ComponentMask(N_Vector mask, int component, const UserData* udata)
 int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
                  SUNMemoryHelper memhelper, SUNContext ctx)
 {
-  constexpr int STENCIL_WIDTH = 1;
 
   SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
 
-  /* Local variables */
-  int retval = 0;
-  char fname[MXSTR];
-
   /* MPI variables */
   udata->comm = MPI_COMM_WORLD;
   MPI_Comm_rank(udata->comm, &udata->myid);
@@ -567,16 +501,16 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
   uopt->outputdir = (char *) "."; /* output directory         */
 
   /* Parse CLI args and set udata/uopt appropriately */
-  retval = ParseArgs(argc, argv, udata, uopt);
+  int retval = ParseArgs(argc, argv, udata, uopt);
   if (check_retval((void*)&retval, "ParseArgs", 1, udata->myid)) return -1;
 
   /* Setup the parallel decomposition */
   const sunindextype npts[] = {uopt->npts, uopt->npts, uopt->npts};
   const realtype amax[] = {0.0, 0.0, 0.0};
   const realtype bmax[] = {udata->xmax, udata->xmax, udata->xmax};
-  udata->grid = new ParallelGrid<realtype,sunindextype,NDIMS>(memhelper,
-    &udata->comm, amax, bmax, npts, 3, BoundaryType::PERIODIC, StencilType::UPWIND, STENCIL_WIDTH, uopt->npxyz
-  );
+  udata->grid = new ParallelGrid<realtype,sunindextype>(memhelper, &udata->comm,
+    amax, bmax, npts, 3, BoundaryType::PERIODIC, StencilType::UPWIND, udata->c,
+    STENCIL_WIDTH, uopt->npxyz);
 
   /* Create the solution masks */
   udata->umask = N_VMake_MPIPlusX(udata->comm, LocalNvector(udata->grid->neq, ctx), ctx);
@@ -589,6 +523,7 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
   /* Open output files for results */
   if (uopt->save)
   {
+    char fname[MXSTR];
     if (udata->myid == 0)
     {
       sprintf(fname, "%s/t.%06d.txt", uopt->outputdir, udata->myid);
@@ -609,7 +544,7 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
   if (udata->myid == 0)
   {
     printf("\n\t\tAdvection-Reaction Test Problem\n\n");
-    printf("Using the %s NVECTOR\n", NVECTOR_ID_STRING);
+    printf("Using the MPI+%s NVECTOR\n", NVECTOR_ID_STRING);
     printf("Number of Processors = %li\n", (long int) udata->nprocs);
     udata->grid->PrintInfo();
     printf("Problem Parameters:\n");
@@ -632,7 +567,6 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
     printf("Output directory: %s\n", uopt->outputdir);
   }
 
-
   /* return success */
   return(0);
 }
@@ -644,8 +578,8 @@ void Gaussian3D(realtype& x, realtype& y, realtype& z, realtype xmax)
 {
   /* Gaussian distribution defaults */
   const realtype alpha = 0.1;
-  const realtype mu[3] = { xmax/RCONST(2.0), xmax/RCONST(2.0), xmax/RCONST(2.0) };
-  const realtype sigma[3] = { xmax/RCONST(4.0), xmax/RCONST(4.0), xmax/RCONST(4.0) }; // Sigma = diag(sigma)
+  const realtype mu[] = { xmax/RCONST(2.0), xmax/RCONST(2.0), xmax/RCONST(2.0) };
+  const realtype sigma[] = { xmax/RCONST(4.0), xmax/RCONST(4.0), xmax/RCONST(4.0) }; // Sigma = diag(sigma)
 
   /* denominator = 2*sqrt(|Sigma|*(2pi)^3) */
   const realtype denom = 2.0 * sqrt((sigma[0]*sigma[1]*sigma[2])*pow(2*M_PI,3));
@@ -664,6 +598,7 @@ int SetIC(N_Vector y, UserData* udata)
   const int      nxl  = udata->grid->nxl;
   const int      nyl  = udata->grid->nyl;
   const int      nzl  = udata->grid->nzl;
+  const int      dof  = udata->grid->dof;
   const realtype dx   = udata->grid->dx;
   const realtype dy   = udata->grid->dy;
   const realtype dz   = udata->grid->dz;
@@ -683,22 +618,25 @@ int SetIC(N_Vector y, UserData* udata)
   const realtype vs = k2 * k4 * B / (k1 * k3 * A);
   const realtype ws = 3.0;
 
+  /* Create 4D view of y */
+  RAJA::View<realtype, RAJA::Layout<4> > yview(GetVecData(y),
+                                               nxl, nyl, nzl, dof);
+
   /* Gaussian perturbation of the steady state solution */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > yview(GetVecData(y), nxl, nyl, nzl,
-                                                     udata->grid->dof);
   auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
                                 RAJA::RangeSegment(0, nyl),
                                 RAJA::RangeSegment(0, nzl));
   RAJA::kernel<XYZ_KERNEL_POL>(range,
-    [=] DEVICE_FUNC (int xi, int yi, int zi) {
-    realtype x = (xcrd * nxl + xi) * dx;
-    realtype y = (ycrd * nyl + yi) * dy;
-    realtype z = (zcrd * nzl + zi) * dz;
+    [=] DEVICE_FUNC (int i, int j, int k)
+  {
+    realtype x = (xcrd * nxl + i) * dx;
+    realtype y = (ycrd * nyl + j) * dy;
+    realtype z = (zcrd * nzl + k) * dz;
     Gaussian3D(x,y,z,xmax);
     const realtype p = x + y + z;
-    yview(xi,yi,zi,0) = us + p;
-    yview(xi,yi,zi,1) = vs + p;
-    yview(xi,yi,zi,2) = ws + p;
+    yview(i,j,k,0) = us + p;
+    yview(i,j,k,1) = vs + p;
+    yview(i,j,k,2) = ws + p;
   });
 
   /* Return success */
@@ -710,23 +648,17 @@ int SetIC(N_Vector y, UserData* udata)
 int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt)
 {
   SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-  
-  realtype  u, v, w, N;
-  realtype* ydata = NULL;
-
-  /* get vector data array */
-  ydata = N_VGetArrayPointer(y);
-  if (check_retval((void *) ydata, "N_VGetArrayPointer", 0, udata->myid)) return -1;
 
+  /* Copy solution data to host mirror view */
   CopyVecFromDevice(N_VGetLocalVector_MPIPlusX(y));
 
   /* output current solution norm to screen */
-  N = (realtype) udata->grid->npts();
-  u = N_VWL2Norm(y, udata->umask);
+  realtype N = (realtype) udata->grid->npts();
+  realtype u = N_VWL2Norm(y, udata->umask);
   u = sqrt(u*u/N);
-  v = N_VWL2Norm(y, udata->vmask);
+  realtype v = N_VWL2Norm(y, udata->vmask);
   v = sqrt(v*v/N);
-  w = N_VWL2Norm(y, udata->wmask);
+  realtype w = N_VWL2Norm(y, udata->wmask);
   w = sqrt(w*w/N);
   if (udata->myid == 0) {
     printf("     %10.6f   %10.6f   %10.6f   %10.6f\n", t, u, v, w);
@@ -736,32 +668,38 @@ int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt)
   if (uopt->save)
   {
     /* output the times to disk */
-    if (udata->myid == 0 && udata->TFID)
+    if (udata->myid == 0 && udata->TFID) {
       fprintf(udata->TFID," %.16e\n", t);
+      std::fflush(udata->TFID);
+    }
+
+    /* create 4D view of host data */
+    realtype* ydata = NULL;
+    ydata = N_VGetArrayPointer(y);
+    if (check_retval((void *) ydata, "N_VGetArrayPointer", 0, udata->myid)) return -1;
+    const int nxl = udata->grid->nxl;
+    const int nyl = udata->grid->nyl;
+    const int nzl = udata->grid->nzl;
+    const int dof = udata->grid->dof;
+    RAJA::View<realtype, RAJA::Layout<4> > Yview(ydata, nxl, nyl, nzl, dof);
 
     /* output results to disk */
-    RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(ydata,
-                                                       udata->grid->nxl,
-                                                       udata->grid->nyl,
-                                                       udata->grid->nzl,
-                                                       udata->grid->dof);
-
-    auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                  RAJA::RangeSegment(0, udata->grid->nyl),
-                                  RAJA::RangeSegment(0, udata->grid->nzl));
-
-    RAJA::kernel<XYZ_KERNEL_SERIAL_POLICY>(range,
-      [=] (int i, int j, int k) {
-      fprintf(udata->UFID," %.16e", Yview(i,j,k,0));
-      fprintf(udata->VFID," %.16e", Yview(i,j,k,1));
-      fprintf(udata->WFID," %.16e", Yview(i,j,k,2));
-    });
+    for (int i = 0; i < nxl; i++)
+      for (int j = 0; j < nyl; j++)
+        for (int k = 0; k < nzl; k++) {
+          fprintf(udata->UFID," %.16e", Yview(i,j,k,0));
+          fprintf(udata->VFID," %.16e", Yview(i,j,k,1));
+          fprintf(udata->WFID," %.16e", Yview(i,j,k,2));
+        }
 
     fprintf(udata->UFID,"\n");
     fprintf(udata->VFID,"\n");
     fprintf(udata->WFID,"\n");
+    std::fflush(udata->UFID);
+    std::fflush(udata->VFID);
+    std::fflush(udata->WFID);
   }
-  
+
   return(0);
 }
 
@@ -799,4 +737,3 @@ void InputError(char *name)
 
   MPI_Barrier(MPI_COMM_WORLD);
 }
-
diff --git a/benchmarks/advection_reaction_3D/advection_reaction_3D.hpp b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.hpp
similarity index 91%
rename from benchmarks/advection_reaction_3D/advection_reaction_3D.hpp
rename to benchmarks/advection_reaction_3D/raja/advection_reaction_3D.hpp
index 4396e69eb5..e4227d62c7 100644
--- a/benchmarks/advection_reaction_3D/advection_reaction_3D.hpp
+++ b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.hpp
@@ -1,5 +1,6 @@
 /* -----------------------------------------------------------------------------
  * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
+ *                Daniel R. Reynolds @ SMU
  * -----------------------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2023, Lawrence Livermore National Security
@@ -34,19 +35,9 @@ using sundials_tools::BoundaryType;
 using sundials_tools::StencilType;
 using std::string;
 
-/* Number of dimensions */
-constexpr int NDIMS = 3;
-
 /* Maximum size of output directory string */
 constexpr int MXSTR = 2048;
 
-/* Accessor macro:
-   n = number of state variables
-   i = mesh node index
-   c = component */
-#define IDX(n,i,c) ((n)*(i)+(c))
-
-
 /*
  * Data structure for problem options
  */
@@ -113,7 +104,7 @@ struct UserData
   realtype  c;    /* advection coefficient        */
 
   /* parallel mesh */
-  ParallelGrid<realtype,sunindextype,NDIMS>* grid;
+  ParallelGrid<realtype,sunindextype>* grid;
 
   /* count of implicit function evals by the task local nonlinear solver */
   long int nnlfi;
@@ -122,7 +113,10 @@ struct UserData
   UserOptions* uopt;
 
   /* constructor that takes the context */
-  UserData(SUNContext ctx) : ctx(ctx) {
+  UserData(SUNContext ctx)
+    : ctx(ctx), umask(nullptr), vmask(nullptr), wmask(nullptr), uopt(nullptr),
+      TFID(nullptr), UFID(nullptr), VFID(nullptr), WFID(nullptr)
+  {
     SUNContext_GetProfiler(ctx, &prof);
   }
 
@@ -161,15 +155,14 @@ extern int EvolveDAEProblem(N_Vector y, UserData* udata, UserOptions* uopt);
 /* function to set initial condition */
 int SetIC(N_Vector y, UserData* udata);
 
-/* functions to exchange neighbor data */
-int ExchangeBCOnly(N_Vector y, UserData* udata);
-int ExchangeAllStart(N_Vector y, UserData* udata);
-int ExchangeAllEnd(UserData* udata);
+/* function to fill neighbor data */
+int FillSendBuffers(N_Vector y, UserData* udata);
 
 /* functions for processing command line args */
 int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt,
                  SUNMemoryHelper memhelper, SUNContext ctx);
 void InputError(char *name);
+int ComponentMask(N_Vector mask, const int component, const UserData* udata);
 
 /* function to write solution to disk */
 int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt);
diff --git a/benchmarks/advection_reaction_3D/raja/arkode_driver.cpp b/benchmarks/advection_reaction_3D/raja/arkode_driver.cpp
new file mode 100644
index 0000000000..e2cf1451e3
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/arkode_driver.cpp
@@ -0,0 +1,782 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#include "arkode/arkode_arkstep.h"
+#include "arkode/arkode_erkstep.h"
+#include "sunlinsol/sunlinsol_spgmr.h"
+#include "sunnonlinsol/sunnonlinsol_newton.h"
+#include "sunnonlinsol/sunnonlinsol_fixedpoint.h"
+#include "advection_reaction_3D.hpp"
+#include "rhs3D.hpp"
+
+/*
+ * Definitions for a custom task local SUNNonlinearSolver
+ */
+
+typedef struct
+{
+  int                myid;
+  int                nprocs;
+  long int           ncnf;
+  MPI_Comm           comm;
+  SUNNonlinearSolver local_nls;
+} *TaskLocalNewton_Content;
+
+/* Content accessor macros */
+#define GET_NLS_CONTENT(NLS) ( (TaskLocalNewton_Content)(NLS->content) )
+#define LOCAL_NLS(NLS)       ( GET_NLS_CONTENT(NLS)->local_nls )
+
+/* SUNNonlinearSolver constructor */
+SUNNonlinearSolver TaskLocalNewton(SUNContext ctx, N_Vector y, FILE* DFID);
+
+
+/* --------------------------------------------------------------
+ * Evolve functions
+ * --------------------------------------------------------------*/
+
+/* Setup ARKODE and evolve problem in time with DIRK method */
+int EvolveProblemDIRK(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              arkode_mem = NULL;  /* empty ARKODE memory structure    */
+  SUNNonlinearSolver NLS = NULL;         /* empty nonlinear solver structure */
+  SUNLinearSolver    LS  = NULL;         /* empty linear solver structure    */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, nst_a, netf;  /* step stats                   */
+  long int nfe, nfi;          /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+  long int nli, npsol;        /* linear solver stats          */
+  FILE*    DFID = NULL;       /* diagnostics output file      */
+  char     fname[MXSTR];
+
+  /* Unsplit methods should add the advection and reaction terms together */
+  udata->add_reactions = true;
+
+  /* Create the ARK timestepper module */
+  arkode_mem = ARKStepCreate(NULL, AdvectionReaction, uopt->t0, y, udata->ctx);
+  if (check_retval((void*)arkode_mem, "ARKStepCreate", 0, udata->myid)) return 1;
+
+  /* Select the method order */
+  retval = ARKStepSetOrder(arkode_mem, uopt->order);
+  if (check_retval(&retval, "ARKStepSetOrder", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = ARKStepSetUserData(arkode_mem, (void*) udata);
+  if (check_retval(&retval, "ARKStepSetUserData*", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = ARKStepSStolerances(arkode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "ARKStepSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = ARKStepSetMaxNumSteps(arkode_mem, 100000);
+  if (check_retval(&retval, "ARKStepSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Open output file for integrator diagnostics */
+  if (uopt->save)
+  {
+    sprintf(fname, "%s/diagnostics.%06d.txt", uopt->outputdir, udata->myid);
+    DFID = fopen(fname, "w");
+
+    retval = ARKStepSetDiagnostics(arkode_mem, DFID);
+    if (check_retval(&retval, "ARKStepSetDiagnostics", 1, udata->myid)) return 1;
+  }
+
+  /* Create the (non)linear solver */
+  if (uopt->nls == "newton")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_Newton(y, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+
+    /* Create linear solver */
+    LS = uopt->precond ? SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx) : SUNLinSol_SPGMR(y, PREC_NONE, 0, udata->ctx);
+    if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1;
+
+    /* Attach linear solver */
+    retval = ARKStepSetLinearSolver(arkode_mem, LS, NULL);
+    if (check_retval(&retval, "ARKStepSetLinearSolver", 1, udata->myid)) return 1;
+
+    /* Attach preconditioner */
+    retval = ARKStepSetPreconditioner(arkode_mem, NULL, PSolve);
+    if (check_retval(&retval, "ARKStepSetPreconditioner", 1, udata->myid)) return 1;
+  }
+  else if (uopt->nls == "fixedpoint")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+  }
+  else
+  {
+    fprintf(stderr, "\nERROR: ARK-DIRK is not compatible with the nls option provided\n");
+    return 1;
+  }
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0)
+    dtout /= uopt->nout;
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = ARKStepEvolve(arkode_mem, tout, y, &t, ARK_NORMAL);
+    if (check_retval(&retval, "ARKStepEvolve", 1, udata->myid)) break;
+
+    /* Output state */
+    if(uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* close output stream */
+  if (uopt->save) fclose(DFID);
+
+  /* Get final statistics */
+  retval = ARKStepGetNumSteps(arkode_mem, &nst);
+  check_retval(&retval, "ARKStepGetNumSteps", 1, udata->myid);
+  retval = ARKStepGetNumStepAttempts(arkode_mem, &nst_a);
+  check_retval(&retval, "ARKStepGetNumStepAttempts", 1, udata->myid);
+  retval = ARKStepGetNumRhsEvals(arkode_mem, &nfe, &nfi);
+  check_retval(&retval, "ARKStepGetNumRhsEvals", 1, udata->myid);
+  retval = ARKStepGetNumErrTestFails(arkode_mem, &netf);
+  check_retval(&retval, "ARKStepGetNumErrTestFails", 1, udata->myid);
+  retval = ARKStepGetNumNonlinSolvIters(arkode_mem, &nni);
+  check_retval(&retval, "ARKStepGetNumNonlinSolvIters", 1, udata->myid);
+  retval = ARKStepGetNumNonlinSolvConvFails(arkode_mem, &ncnf);
+  check_retval(&retval, "ARKStepGetNumNonlinSolvConvFails", 1, udata->myid);
+  if (uopt->nls == "newton")
+  {
+    retval = ARKStepGetNumLinIters(arkode_mem, &nli);
+    check_retval(&retval, "ARKStepGetNumLinIters", 1, udata->myid);
+    retval = ARKStepGetNumPrecSolves(arkode_mem, &npsol);
+    check_retval(&retval, "ARKStepGetNumPrecSolves", 1, udata->myid);
+  }
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li (attempted = %li)\n", nst, nst_a);
+    printf("   Total RHS evals:  Fe = %li,  Fi = %li\n", nfe, nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n",
+           ncnf);
+    printf("   Total number of nonlinear iterations = %li\n", nni);
+    if (uopt->nls == "newton")
+    {
+      printf("   Total number of linear iterations = %li\n", nli);
+      printf("   Total number of preconditioner solves = %li\n", npsol);
+    }
+  }
+
+  /* Clean up */
+  ARKStepFree(&arkode_mem);
+  SUNNonlinSolFree(NLS);
+  if (LS) SUNLinSolFree(LS);
+
+  /* Return success */
+  return(0);
+}
+
+
+/* Setup ARKODE and evolve problem in time with IMEX method */
+int EvolveProblemIMEX(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              arkode_mem = NULL;  /* empty ARKODE memory structure    */
+  SUNNonlinearSolver NLS = NULL;         /* empty nonlinear solver structure */
+  SUNLinearSolver    LS  = NULL;         /* empty linear solver structure    */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, nst_a, netf;  /* step stats                   */
+  long int nfe, nfi;          /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+  long int nli, npsol;        /* linear solver stats          */
+  FILE*    DFID = NULL;       /* diagnostics output file      */
+  char     fname[MXSTR];
+
+  /* Additively split methods should not add the advection and reaction terms */
+  udata->add_reactions = false;
+
+  /* Create the ARK timestepper module */
+  arkode_mem = ARKStepCreate(Advection, Reaction, uopt->t0, y, udata->ctx);
+  if (check_retval((void*)arkode_mem, "ARKStepCreate", 0, udata->myid)) return 1;
+
+  /* Select the method order */
+  retval = ARKStepSetOrder(arkode_mem, uopt->order);
+  if (check_retval(&retval, "ARKStepSetOrder", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = ARKStepSetUserData(arkode_mem, (void*) udata);
+  if (check_retval(&retval, "ARKStepSetUserData*", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = ARKStepSStolerances(arkode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "ARKStepSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = ARKStepSetMaxNumSteps(arkode_mem, 100000);
+  if (check_retval(&retval, "ARKStepSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Open output file for integrator diagnostics */
+  if (uopt->save)
+  {
+    sprintf(fname, "%s/diagnostics.%06d.txt", uopt->outputdir, udata->myid);
+    DFID = fopen(fname, "w");
+
+    retval = ARKStepSetDiagnostics(arkode_mem, DFID);
+    if (check_retval(&retval, "ARKStepSetDiagnostics", 1, udata->myid)) return 1;
+  }
+
+  /* Create the (non)linear solver */
+  if (uopt->nls == "newton")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_Newton(y, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+
+    /* Create linear solver */
+    LS = SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx);
+    if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1;
+
+    /* Attach linear solver */
+    retval = ARKStepSetLinearSolver(arkode_mem, LS, NULL);
+    if (check_retval(&retval, "ARKStepSetLinearSolver", 1, udata->myid)) return 1;
+
+    /* Attach preconditioner */
+    retval = ARKStepSetPreconditioner(arkode_mem, NULL, PSolve);
+    if (check_retval(&retval, "ARKStepSetPreconditioner", 1, udata->myid)) return 1;
+  }
+  else if (uopt->nls == "tl-newton")
+  {
+    /* The custom task-local nonlinear solver handles the linear solve
+       as well, so we do not need a SUNLinearSolver. */
+    NLS = TaskLocalNewton(udata->ctx, y, DFID);
+    if (check_retval((void *)NLS, "TaskLocalNewton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+  }
+  else if (uopt->nls == "fixedpoint")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = ARKStepSetNonlinearSolver(arkode_mem, NLS);
+    if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1;
+  }
+  else
+  {
+    fprintf(stderr, "\nERROR: ARK-IMEX method is not compatible with the nls option provided\n");
+    return 1;
+  }
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0)
+    dtout /= uopt->nout;
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = ARKStepEvolve(arkode_mem, tout, y, &t, ARK_NORMAL);
+    if (check_retval(&retval, "ARKStepEvolve", 1, udata->myid)) break;
+
+    /* Output state */
+    if(uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* close output stream */
+  if (uopt->save) fclose(DFID);
+
+  /* Get final statistics */
+  retval = ARKStepGetNumSteps(arkode_mem, &nst);
+  check_retval(&retval, "ARKStepGetNumSteps", 1, udata->myid);
+  retval = ARKStepGetNumStepAttempts(arkode_mem, &nst_a);
+  check_retval(&retval, "ARKStepGetNumStepAttempts", 1, udata->myid);
+  retval = ARKStepGetNumRhsEvals(arkode_mem, &nfe, &nfi);
+  check_retval(&retval, "ARKStepGetNumRhsEvals", 1, udata->myid);
+  retval = ARKStepGetNumErrTestFails(arkode_mem, &netf);
+  check_retval(&retval, "ARKStepGetNumErrTestFails", 1, udata->myid);
+  retval = ARKStepGetNumNonlinSolvIters(arkode_mem, &nni);
+  check_retval(&retval, "ARKStepGetNumNonlinSolvIters", 1, udata->myid);
+  retval = ARKStepGetNumNonlinSolvConvFails(arkode_mem, &ncnf);
+  check_retval(&retval, "ARKStepGetNumNonlinSolvConvFails", 1, udata->myid);
+  if (uopt->nls == "newton")
+  {
+    retval = ARKStepGetNumLinIters(arkode_mem, &nli);
+    check_retval(&retval, "ARKStepGetNumLinIters", 1, udata->myid);
+    retval = ARKStepGetNumPrecSolves(arkode_mem, &npsol);
+    check_retval(&retval, "ARKStepGetNumPrecSolves", 1, udata->myid);
+  }
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li (attempted = %li)\n", nst, nst_a);
+    printf("   Total RHS evals:  Fe = %li,  Fi = %li\n", nfe, nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n",
+           ncnf);
+    printf("   Total number of nonlinear iterations = %li\n", nni);
+    if (uopt->nls == "newton")
+    {
+      printf("   Total number of linear iterations = %li\n", nli);
+      printf("   Total number of preconditioner solves = %li\n", npsol);
+    }
+  }
+
+  /* Clean up */
+  ARKStepFree(&arkode_mem);
+  if (NLS) SUNNonlinSolFree(NLS);
+  if (LS) SUNLinSolFree(LS);
+
+  /* Return success */
+  return(0);
+}
+
+
+/* Setup ARKODE and evolve problem in time explicitly */
+int EvolveProblemExplicit(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*    arkode_mem = NULL; /* empty ARKODE memory structure */
+  realtype   t, dtout, tout;    /* current/output time data      */
+  int      retval;            /* reusable error-checking flag  */
+  int      iout;              /* output counter                */
+  long int nst, nst_a, netf;  /* step stats                    */
+  long int nfe;               /* RHS stats                     */
+  FILE*    DFID;              /* diagnostics output file       */
+  char     fname[MXSTR];
+
+  /* Unsplit methods should add the advection and reaction terms together */
+  udata->add_reactions = true;
+
+  /* Create the ERK timestepper module */
+  arkode_mem = ERKStepCreate(AdvectionReaction, uopt->t0, y, udata->ctx);
+  if (check_retval((void*)arkode_mem, "ERKStepCreate", 0, udata->myid)) return 1;
+
+  /* Select the method order */
+  retval = ERKStepSetOrder(arkode_mem, uopt->order);
+  if (check_retval(&retval, "ERKStepSetOrder", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = ERKStepSetUserData(arkode_mem, (void*) udata);
+  if (check_retval(&retval, "ERKStepSetUserData", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = ERKStepSStolerances(arkode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "ERKStepSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = ERKStepSetMaxNumSteps(arkode_mem, 1000000);
+  if (check_retval(&retval, "ERKStepSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Set fixed step size */
+  retval = ERKStepSetFixedStep(arkode_mem, 1e-5);
+  if (check_retval(&retval, "ERKStepSetFixedStep", 1, udata->myid)) return 1;
+
+  /* Open output file for integrator diagnostics */
+  if (uopt->save)
+  {
+    sprintf(fname, "%s/diagnostics.%06d.txt", uopt->outputdir, udata->myid);
+    DFID = fopen(fname, "w");
+
+    retval = ERKStepSetDiagnostics(arkode_mem, DFID);
+    if (check_retval(&retval, "ERKStepSetDiagnostics", 1, udata->myid)) return 1;
+  }
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0)
+    dtout /= uopt->nout;
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = ERKStepEvolve(arkode_mem, tout, y, &t, ARK_NORMAL);
+    if (check_retval(&retval, "ERKStepEvolve", 1, udata->myid)) break;
+
+    /* Output state */
+    if(uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* close output stream */
+  if (uopt->save) fclose(DFID);
+
+  /* Get final statistics */
+  retval = ERKStepGetNumSteps(arkode_mem, &nst);
+  check_retval(&retval, "ERKStepGetNumSteps", 1, udata->myid);
+  retval = ERKStepGetNumStepAttempts(arkode_mem, &nst_a);
+  check_retval(&retval, "ERKStepGetNumStepAttempts", 1, udata->myid);
+  retval = ERKStepGetNumRhsEvals(arkode_mem, &nfe);
+  check_retval(&retval, "ERKStepGetNumRhsEvals", 1, udata->myid);
+  retval = ERKStepGetNumErrTestFails(arkode_mem, &netf);
+  check_retval(&retval, "ERKStepGetNumErrTestFails", 1, udata->myid);
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li (attempted = %li)\n", nst, nst_a);
+    printf("   Total RHS evals:  Fe = %li\n", nfe);
+    printf("   Total number of error test failures = %li\n", netf);
+  }
+
+  /* Clean up */
+  ERKStepFree(&arkode_mem);
+
+  /* Return success */
+  return(0);
+}
+
+
+/* --------------------------------------------------------------
+ * (Non)linear system functions
+ * --------------------------------------------------------------*/
+
+int TaskLocalNlsResidual(N_Vector ycor, N_Vector F, void* arkode_mem)
+{
+  /* temporary variables */
+  UserData* udata = NULL;
+  int       retval;
+  realtype  c[3];
+  N_Vector  X[3];
+
+  /* nonlinear system data */
+  N_Vector z, zpred, Fi, sdata;
+  realtype tcur, gamma;
+  void*    user_data = NULL;
+
+  /* fetch the current stage data; if that fails the vectors and user data
+     below are unusable, so report an unrecoverable error */
+  retval = ARKStepGetNonlinearSystemData(arkode_mem, &tcur, &zpred, &z, &Fi,
+                                         &gamma, &sdata, &user_data);
+  if (retval != ARK_SUCCESS || user_data == NULL) return(-1);
+  udata = (UserData*) user_data;
+
+  /* update 'z' value as stored predictor + current corrector */
+  N_VLinearSum(1.0, N_VGetLocalVector_MPIPlusX(zpred), 1.0, ycor,
+               N_VGetLocalVector_MPIPlusX(z));
+
+  /* compute implicit RHS and save for later */
+  retval = Reaction(tcur, N_VGetLocalVector_MPIPlusX(z),
+                    N_VGetLocalVector_MPIPlusX(Fi), user_data);
+  udata->nnlfi++; /* count calls to Fi as part of the nonlinear residual */
+  if (retval < 0) return(-1);
+  if (retval > 0) return(+1);
+
+  /* form the residual F = ycor - sdata - gamma * Fi */
+  c[0] = 1.0;
+  X[0] = ycor;
+  c[1] = -1.0;
+  X[1] = N_VGetLocalVector_MPIPlusX(sdata);
+  c[2] = -gamma;
+  X[2] = N_VGetLocalVector_MPIPlusX(Fi);
+
+  retval = N_VLinearCombination(3, c, X, F);
+  if (retval != 0) return(-1);
+
+  return(0);
+}
+
+
+int TaskLocalLSolve(N_Vector delta, void* arkode_mem)
+{
+  /* local variables */
+  UserData* udata = NULL;
+  int       retval;
+
+  /* nonlinear system data */
+  N_Vector z, zpred, Fi, sdata;
+  realtype tcur, gamma;
+  void*    user_data = NULL;
+
+  /* without the stage data there is nothing to solve; fail unrecoverably
+     instead of dereferencing a NULL udata below */
+  retval = ARKStepGetNonlinearSystemData(arkode_mem, &tcur, &zpred, &z, &Fi,
+                                         &gamma, &sdata, &user_data);
+  if (retval != ARK_SUCCESS || user_data == NULL) return(-1);
+  udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set up I - gamma*J and solve; delta is passed as both vector arguments */
+  return(SolveReactionLinSys(z, delta, delta, gamma, udata));
+}
+
+
+SUNNonlinearSolver_Type TaskLocalNewton_GetType(SUNNonlinearSolver NLS)
+{
+  return SUNNONLINEARSOLVER_ROOTFIND; /* NLS is unused: the type is always root-finding */
+}
+
+
+int TaskLocalNewton_Initialize(SUNNonlinearSolver NLS)
+{
+  /* nothing can be initialized without a solver object */
+  if (!NLS) { return SUN_NLS_MEM_NULL; }
+
+  /* route the wrapped solver's residual evaluations and linear solves
+     through the task-local routines TaskLocalNlsResidual/TaskLocalLSolve */
+  SUNNonlinSolSetSysFn(LOCAL_NLS(NLS), TaskLocalNlsResidual);
+  SUNNonlinSolSetLSolveFn(LOCAL_NLS(NLS), TaskLocalLSolve);
+  /* the result is whatever the wrapped solver's initialization reports */
+  return SUNNonlinSolInitialize(LOCAL_NLS(NLS));
+}
+
+
+int TaskLocalNewton_Solve(SUNNonlinearSolver NLS,
+                          N_Vector y0, N_Vector ycor,
+                          N_Vector w, realtype tol,
+                          booleantype callLSetup, void* mem)
+{
+  int local_flag;   /* status returned by this task's solve */
+  int global_min;   /* smallest status over the communicator */
+  int global_max;   /* largest status over the communicator  */
+
+  /* every pointer argument is required */
+  if (!NLS || !y0 || !ycor || !w || !mem)
+  {
+    return SUN_NLS_MEM_NULL;
+  }
+
+  /* communicator stored in the solver content */
+  MPI_Comm comm = GET_NLS_CONTENT(NLS)->comm;
+
+  /* run the wrapped solver on the local parts of the MPIPlusX vectors */
+  N_Vector y0_loc   = N_VGetLocalVector_MPIPlusX(y0);
+  N_Vector ycor_loc = N_VGetLocalVector_MPIPlusX(ycor);
+  N_Vector w_loc    = N_VGetLocalVector_MPIPlusX(w);
+  local_flag = SUNNonlinSolSolve(LOCAL_NLS(NLS), y0_loc, ycor_loc, w_loc,
+                                 tol, callLSetup, mem);
+
+  /* a negative minimum means some task failed unrecoverably: return it */
+  MPI_Allreduce(&local_flag, &global_min, 1, MPI_INT, MPI_MIN, comm);
+  if (global_min < 0) { return global_min; }
+
+  /* otherwise use the largest status and count recoverable conv. failures */
+  MPI_Allreduce(&local_flag, &global_max, 1, MPI_INT, MPI_MAX, comm);
+  if (global_max == SUN_NLS_CONV_RECVR)
+  {
+    GET_NLS_CONTENT(NLS)->ncnf++;
+  }
+
+  return global_max; /* 0 on success, > 0 for a recoverable error */
+}
+
+
+int TaskLocalNewton_Free(SUNNonlinearSolver NLS)
+{
+  /* a NULL handle has nothing left to release */
+  if (!NLS) { return SUN_NLS_SUCCESS; }
+
+  /* release the wrapped local solver first, then the content structure
+     that holds it */
+  if (NLS->content != NULL)
+  {
+    SUNNonlinSolFree(LOCAL_NLS(NLS));
+    free(NLS->content);
+    NLS->content = NULL;
+  }
+
+  /* release the table of operations */
+  if (NLS->ops != NULL)
+  {
+    free(NLS->ops);
+    NLS->ops = NULL;
+  }
+
+  /* finally release the generic solver object itself */
+  free(NLS);
+
+  return SUN_NLS_SUCCESS;
+}
+
+
+int TaskLocalNewton_SetSysFn(SUNNonlinearSolver NLS,
+                             SUNNonlinSolSysFn SysFn)
+{
+  /* forward SysFn to the wrapped local solver; a NULL NLS is an error */
+  int flag = SUN_NLS_MEM_NULL;
+  if (NLS != NULL) { flag = SUNNonlinSolSetSysFn(LOCAL_NLS(NLS), SysFn); }
+
+  return flag;
+}
+
+
+int TaskLocalNewton_SetConvTestFn(SUNNonlinearSolver NLS,
+                                  SUNNonlinSolConvTestFn CTestFn,
+                                  void* ctest_data)
+{
+  /* forward the test and its data to the wrapped solver; NULL NLS is an error */
+  int flag = SUN_NLS_MEM_NULL;
+  if (NLS != NULL) { flag = SUNNonlinSolSetConvTestFn(LOCAL_NLS(NLS), CTestFn, ctest_data); }
+
+  return flag;
+}
+
+
+int TaskLocalNewton_GetNumConvFails(SUNNonlinearSolver NLS,
+                                    long int *nconvfails)
+{
+  /* the counter lives in the solver content, so NLS must exist */
+  if (!NLS) { return SUN_NLS_MEM_NULL; }
+
+  /* copy out the count that TaskLocalNewton_Solve increments */
+  *nconvfails = GET_NLS_CONTENT(NLS)->ncnf;
+  return 0;
+}
+
+
+SUNNonlinearSolver TaskLocalNewton(SUNContext ctx, N_Vector y, FILE* DFID)
+{
+  SUNNonlinearSolver      NLS     = NULL;
+  TaskLocalNewton_Content content = NULL;
+
+  /* Only a non-NULL MPIPlusX vector can serve as the template */
+  if (!y || N_VGetVectorID(y) != SUNDIALS_NVEC_MPIPLUSX)
+  {
+    return NULL;
+  }
+
+  /* Start from an empty generic nonlinear solver object */
+  NLS = SUNNonlinSolNewEmpty(ctx);
+  if (!NLS) { return NULL; }
+
+  /* Install the task-local implementations of the solver operations */
+  NLS->ops->gettype         = TaskLocalNewton_GetType;
+  NLS->ops->initialize      = TaskLocalNewton_Initialize;
+  NLS->ops->solve           = TaskLocalNewton_Solve;
+  NLS->ops->free            = TaskLocalNewton_Free;
+  NLS->ops->setsysfn        = TaskLocalNewton_SetSysFn;
+  NLS->ops->setctestfn      = TaskLocalNewton_SetConvTestFn;
+  NLS->ops->getnumconvfails = TaskLocalNewton_GetNumConvFails;
+
+  /* Allocate zero-filled content so every member starts as 0/NULL */
+  content = (TaskLocalNewton_Content) calloc(1, sizeof *content);
+  if (!content)
+  {
+    SUNNonlinSolFree(NLS);
+    return NULL;
+  }
+
+  /* Attach the content to the solver object */
+  NLS->content = content;
+
+  /* Record the communicator of y; a missing or null one is an error */
+  void* tmpcomm = N_VGetCommunicator(y);
+  if (!tmpcomm) { SUNNonlinSolFree(NLS); return NULL; }
+
+  MPI_Comm* comm = (MPI_Comm*) tmpcomm;
+  if (*comm == MPI_COMM_NULL) { SUNNonlinSolFree(NLS); return NULL; }
+
+  content->comm = *comm;
+
+  /* Wrap a standard Newton solver built on the local part of y */
+  content->local_nls = SUNNonlinSol_Newton(N_VGetLocalVector_MPIPlusX(y), ctx);
+  if (!content->local_nls) { SUNNonlinSolFree(NLS); return NULL; }
+
+  /* Cache this task's rank and the communicator size; reset the counter */
+  MPI_Comm_rank(content->comm, &content->myid);
+  MPI_Comm_size(content->comm, &content->nprocs);
+  content->ncnf = 0;
+
+  /* When a diagnostics file is given, have the local solver log to it */
+  if (DFID)
+  {
+    SUNNonlinSolSetInfoFile_Newton(LOCAL_NLS(NLS), DFID);
+    SUNNonlinSolSetPrintLevel_Newton(LOCAL_NLS(NLS), 1);
+  }
+
+  return NLS;
+}
diff --git a/benchmarks/advection_reaction_3D/backends.hpp b/benchmarks/advection_reaction_3D/raja/backends.hpp
similarity index 100%
rename from benchmarks/advection_reaction_3D/backends.hpp
rename to benchmarks/advection_reaction_3D/raja/backends.hpp
diff --git a/benchmarks/advection_reaction_3D/raja/check_retval.h b/benchmarks/advection_reaction_3D/raja/check_retval.h
new file mode 100644
index 0000000000..887b7cea5d
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/check_retval.h
@@ -0,0 +1,57 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#ifndef _SUNDIALS_CHECK_RETVAL_H_
+#define _SUNDIALS_CHECK_RETVAL_H_
+
+#include <stdio.h>
+
+/* --------------------------------------------------------------
+ * Check the outcome of a call and report a failure on rank 0:
+ *
+ * opt == 0  returnvalue is the pointer produced by the call;
+ *           NULL is treated as a failure
+ * opt == 1  returnvalue points to the int flag produced by the
+ *           call; a value < 0 indicates an error occurred
+ * --------------------------------------------------------------*/
+static int check_retval(void *returnvalue, const char *funcname, int opt, int myid)
+{
+  const int report = (myid == 0);  /* only rank 0 writes the message */
+
+  switch (opt)
+  {
+  case 0:
+    /* pointer check: NULL means the call failed */
+    if (returnvalue != NULL) { break; }
+    if (report)
+      fprintf(stderr, "\nERROR: %s() failed - returned NULL pointer\n\n",
+              funcname);
+    return 1;
+
+  case 1:
+    /* flag check: a negative integer means the call failed */
+    if (*(int*) returnvalue >= 0) { break; }
+    if (report)
+      fprintf(stderr, "\nERROR: %s() returned %d\n\n", funcname, *(int*) returnvalue);
+    return 1;
+
+  default:
+    break;
+  }
+
+  /* no failure detected (any other opt value is never an error) */
+  return 0;
+}
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/raja/cvode_driver.cpp b/benchmarks/advection_reaction_3D/raja/cvode_driver.cpp
new file mode 100644
index 0000000000..e147ccd8c4
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/cvode_driver.cpp
@@ -0,0 +1,289 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#include "cvode/cvode.h"
+#include "sunlinsol/sunlinsol_spgmr.h"
+#include "sunnonlinsol/sunnonlinsol_newton.h"
+#include "sunnonlinsol/sunnonlinsol_fixedpoint.h"
+#include "advection_reaction_3D.hpp"
+#include "rhs3D.hpp"
+
+
+/* Setup CVODE and evolve problem in time with BDF method */
+int EvolveProblemBDF(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              cvode_mem = NULL;   /* empty CVODE memory structure    */
+  SUNNonlinearSolver NLS = NULL;         /* empty nonlinear solver structure */
+  SUNLinearSolver    LS  = NULL;         /* empty linear solver structure    */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, netf;         /* step stats                   */
+  long int nfi;               /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+  long int nli, npsol;        /* linear solver stats          */
+
+  /* Additively split methods should not add the advection and reaction terms */
+  udata->add_reactions = true;
+
+  /* Create CVode */
+  cvode_mem = CVodeCreate(CV_BDF, udata->ctx);
+  if (check_retval((void*)cvode_mem, "CVodeCreate", 0, udata->myid)) return 1;
+
+  /* Initialize CVode */
+  retval = CVodeInit(cvode_mem, AdvectionReaction, uopt->t0, y);
+  if (check_retval((void*)cvode_mem, "CVodeInit", 0, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = CVodeSetUserData(cvode_mem, (void*) udata);
+  if (check_retval(&retval, "CVodeSetUserData*", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = CVodeSStolerances(cvode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "CVodeSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = CVodeSetMaxNumSteps(cvode_mem, 100000);
+  if (check_retval(&retval, "CVodeSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Create the (non)linear solver */
+  if (uopt->nls == "newton")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_Newton(y, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = CVodeSetNonlinearSolver(cvode_mem, NLS);
+    if (check_retval(&retval, "CVodeSetNonlinearSolver", 1, udata->myid)) return 1;
+
+    /* Create linear solver */
+    LS = uopt->precond ? SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx) : SUNLinSol_SPGMR(y, PREC_NONE, 0, udata->ctx);
+    if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1;
+
+    /* Attach linear solver */
+    retval = CVodeSetLinearSolver(cvode_mem, LS, NULL);
+    if (check_retval(&retval, "CVodeSetLinearSolver", 1, udata->myid)) return 1;
+
+    /* Attach preconditioner */
+    retval = CVodeSetPreconditioner(cvode_mem, NULL, PSolve);
+    if (check_retval(&retval, "CVodeSetPreconditioner", 1, udata->myid)) return 1;
+  }
+  else if (uopt->nls == "fixedpoint")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = CVodeSetNonlinearSolver(cvode_mem, NLS);
+    if (check_retval(&retval, "CVodeSetNonlinearSolver", 1, udata->myid)) return 1;
+  }
+  else
+  {
+    fprintf(stderr, "\nERROR: CV-BDF method is not compatible with the nls option provided\n");
+    return 1;
+  }
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0)
+    dtout /= uopt->nout;
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL);
+    if (check_retval(&retval, "CVode", 1, udata->myid)) break;
+
+    /* Output state */
+    if (uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* Get final statistics */
+  retval = CVodeGetNumSteps(cvode_mem, &nst);
+  check_retval(&retval, "CVodeGetNumSteps", 1, udata->myid);
+  retval = CVodeGetNumRhsEvals(cvode_mem, &nfi);
+  check_retval(&retval, "CVodeGetNumRhsEvals", 1, udata->myid);
+  retval = CVodeGetNumErrTestFails(cvode_mem, &netf);
+  check_retval(&retval, "CVodeGetNumErrTestFails", 1, udata->myid);
+  retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni);
+  check_retval(&retval, "CVodeGetNumNonlinSolvIters", 1, udata->myid);
+  retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncnf);
+  check_retval(&retval, "CVodeGetNumNonlinSolvConvFails", 1, udata->myid);
+  if (uopt->nls == "newton")
+  {
+    retval = CVodeGetNumLinIters(cvode_mem, &nli);
+    check_retval(&retval, "CVodeGetNumLinIters", 1, udata->myid);
+    retval = CVodeGetNumPrecSolves(cvode_mem, &npsol);
+    check_retval(&retval, "CVodeGetNumPrecSolves", 1, udata->myid);
+  }
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li\n", nst);
+    printf("   Total RHS evals: %li\n", nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n",
+           ncnf);
+    printf("   Total number of nonlinear iterations = %li\n", nni);
+    if (uopt->nls == "newton")
+    {
+      printf("   Total number of linear iterations = %li\n", nli);
+      printf("   Total number of preconditioner solves = %li\n", npsol);
+    }
+  }
+
+  /* Clean up */
+  CVodeFree(&cvode_mem);
+  if (NLS) SUNNonlinSolFree(NLS);
+  if (LS)  SUNLinSolFree(LS);
+
+  /* Return success */
+  return(0);
+}
+
+
+/* Setup CVODE and evolve problem in time with Adams method */
+int EvolveProblemAdams(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              cvode_mem = NULL;   /* empty CVODE memory structure    */
+  SUNNonlinearSolver NLS = NULL;         /* empty nonlinear solver structure */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, netf;         /* step stats                   */
+  long int nfi;               /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+
+  /* Additively split methods should not add the advection and reaction terms */
+  udata->add_reactions = true;
+
+  /* Create CVode */
+  cvode_mem = CVodeCreate(CV_ADAMS, udata->ctx);
+  if (check_retval((void*)cvode_mem, "CVodeCreate", 0, udata->myid)) return 1;
+
+  /* Initialize CVode */
+  retval = CVodeInit(cvode_mem, AdvectionReaction, uopt->t0, y);
+  if (check_retval((void*)cvode_mem, "CVodeInit", 0, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = CVodeSetUserData(cvode_mem, (void*) udata);
+  if (check_retval(&retval, "CVodeSetUserData*", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = CVodeSStolerances(cvode_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "CVodeSStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = CVodeSetMaxNumSteps(cvode_mem, 100000);
+  if (check_retval(&retval, "CVodeSetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Create nonlinear solver */
+  NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx);
+  if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1;
+
+  /* Attach nonlinear solver */
+  retval = CVodeSetNonlinearSolver(cvode_mem, NLS);
+  if (check_retval(&retval, "CVodeSetNonlinearSolver", 1, udata->myid)) return 1;
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0)
+    dtout /= uopt->nout;
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL);
+    if (check_retval(&retval, "CVode", 1, udata->myid)) break;
+
+    /* Output state */
+    if (uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* Get final statistics */
+  retval = CVodeGetNumSteps(cvode_mem, &nst);
+  check_retval(&retval, "CVodeGetNumSteps", 1, udata->myid);
+  retval = CVodeGetNumRhsEvals(cvode_mem, &nfi);
+  check_retval(&retval, "CVodeGetNumRhsEvals", 1, udata->myid);
+  retval = CVodeGetNumErrTestFails(cvode_mem, &netf);
+  check_retval(&retval, "CVodeGetNumErrTestFails", 1, udata->myid);
+  retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni);
+  check_retval(&retval, "CVodeGetNumNonlinSolvIters", 1, udata->myid);
+  retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncnf);
+  check_retval(&retval, "CVodeGetNumNonlinSolvConvFails", 1, udata->myid);
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li\n", nst);
+    printf("   Total RHS evals: %li\n", nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n",
+           ncnf);
+  }
+
+  /* Clean up */
+  CVodeFree(&cvode_mem);
+  SUNNonlinSolFree(NLS);
+
+  /* Return success */
+  return(0);
+}
diff --git a/benchmarks/advection_reaction_3D/raja/ida_driver.cpp b/benchmarks/advection_reaction_3D/raja/ida_driver.cpp
new file mode 100644
index 0000000000..3ae28a43ca
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/ida_driver.cpp
@@ -0,0 +1,195 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#include "ida/ida.h"
+#include "sunlinsol/sunlinsol_spgmr.h"
+#include "sunnonlinsol/sunnonlinsol_newton.h"
+#include "sunnonlinsol/sunnonlinsol_fixedpoint.h"
+#include "advection_reaction_3D.hpp"
+#include "rhs3D.hpp"
+
+
+/* Fill yp, the initial time derivative, from the state y */
+int SetICDot(N_Vector y, N_Vector yp, UserData* udata)
+{
+  /* yp receives AdvectionReaction evaluated at time 0 with state y */
+  int flag = AdvectionReaction(0, y, yp, (void*) udata);
+
+  /* a negative flag is reported by check_retval and mapped to 1 */
+  const int failed = check_retval(&flag, "AdvectionReaction", 1, udata->myid);
+
+  return failed ? 1 : 0;
+}
+
+
+/* Setup IDA and evolve problem in time with BDF method */
+int EvolveDAEProblem(N_Vector y, UserData* udata, UserOptions* uopt)
+{
+  void*              ida_mem = NULL;  /* empty IDA memory structure       */
+  SUNNonlinearSolver NLS = NULL;      /* empty nonlinear solver structure */
+  SUNLinearSolver    LS  = NULL;      /* empty linear solver structure    */
+  N_Vector           yp  = NULL;      /* empty vector structure           */
+
+  realtype t, dtout, tout;    /* current/output time data     */
+  int      retval;            /* reusable error-checking flag */
+  int      iout;              /* output counter               */
+  long int nst, netf;         /* step stats                   */
+  long int nfi;               /* RHS stats                    */
+  long int nni, ncnf;         /* nonlinear solver stats       */
+  long int nli, npsol;        /* linear solver stats          */
+
+  /* Additively split methods should not add the advection and reaction terms */
+  udata->add_reactions = true;
+
+  /* Create ydot' vector */
+  yp = N_VClone(y);
+  if (check_retval((void*)yp, "N_VClone", 0, udata->myid)) return 1;
+
+  /* Create IDA */
+  ida_mem = IDACreate(udata->ctx);
+  if (check_retval((void*)ida_mem, "IDACreate", 0, udata->myid)) return 1;
+
+  /* Initialize IDA */
+  retval = IDAInit(ida_mem, AdvectionReactionResidual, uopt->t0, y, yp);
+  if (check_retval(&retval, "IDAInit", 1, udata->myid)) return 1;
+
+  /* Attach user data */
+  retval = IDASetUserData(ida_mem, (void*) udata);
+  if (check_retval(&retval, "IDASetUserData*", 1, udata->myid)) return 1;
+
+  /* Specify tolerances */
+  retval = IDASStolerances(ida_mem, uopt->rtol, uopt->atol);
+  if (check_retval(&retval, "IDASStolerances", 1, udata->myid)) return 1;
+
+  /* Increase the max number of steps allowed between outputs */
+  retval = IDASetMaxNumSteps(ida_mem, 100000);
+  if (check_retval(&retval, "IDASetMaxNumSteps", 1, udata->myid)) return 1;
+
+  /* Increase the max number of ETF allowed between outputs */
+  retval = IDASetMaxErrTestFails(ida_mem, 25);
+  if (check_retval(&retval, "IDASetMaxErrTestFails", 1, udata->myid)) return 1;
+
+  /* Create the (non)linear solver */
+  if (uopt->nls == "newton")
+  {
+    /* Create nonlinear solver */
+    NLS = SUNNonlinSol_Newton(y, udata->ctx);
+    if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1;
+
+    /* Attach nonlinear solver */
+    retval = IDASetNonlinearSolver(ida_mem, NLS);
+    if (check_retval(&retval, "IDASetNonlinearSolver", 1, udata->myid)) return 1;
+
+    /* Create linear solver */
+    LS = uopt->precond ? SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx) : SUNLinSol_SPGMR(y, PREC_NONE, 0, udata->ctx);
+    if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1;
+
+    /* Attach linear solver */
+    retval = IDASetLinearSolver(ida_mem, LS, NULL);
+    if (check_retval(&retval, "IDASetLinearSolver", 1, udata->myid)) return 1;
+
+    // /* Attach preconditioner */
+    retval = IDASetPreconditioner(ida_mem, NULL, PSolveRes);
+    if (check_retval(&retval, "IDASetPreconditioner", 1, udata->myid)) return 1;
+  }
+  else
+  {
+    fprintf(stderr, "\nERROR: IDA method is not compatible with the nls option provided\n");
+    return 1;
+  }
+
+  /* Set ydot' initial condition */
+  retval = SetICDot(y, yp, udata);
+  if (check_retval(&retval, "SetICDot", 1, udata->myid)) return 1;
+
+  /* Output initial condition */
+  if (uopt->nout > 0)
+  {
+    if (udata->myid == 0)
+    {
+      printf("\n          t         ||u||_rms   ||v||_rms   ||w||_rms\n");
+      printf("   ----------------------------------------------------\n");
+    }
+    WriteOutput(uopt->t0, y, udata, uopt);
+  }
+
+  /* Integrate to final time */
+  t     = uopt->t0;
+  dtout = (uopt->tf - uopt->t0);
+  if (uopt->nout != 0)
+    dtout /= uopt->nout;
+  tout  = t + dtout;
+  iout  = 0;
+
+  do
+  {
+    /* Integrate to output time */
+    retval = IDASolve(ida_mem, tout, &t, y, yp, IDA_NORMAL);
+    if (check_retval(&retval, "IDA", 1, udata->myid)) break;
+
+    /* Output state */
+    if(uopt->nout > 0) WriteOutput(t, y, udata, uopt);
+
+    /* Update output time */
+    tout += dtout;
+    tout = (tout > uopt->tf) ? uopt->tf : tout;
+
+    iout++;
+  } while (iout < uopt->nout);
+
+  /* Get final statistics */
+  retval = IDAGetNumSteps(ida_mem, &nst);
+  check_retval(&retval, "IDAGetNumSteps", 1, udata->myid);
+  retval = IDAGetNumResEvals(ida_mem, &nfi);
+  check_retval(&retval, "IDAGetNumResEvals", 1, udata->myid);
+  retval = IDAGetNumErrTestFails(ida_mem, &netf);
+  check_retval(&retval, "IDAGetNumErrTestFails", 1, udata->myid);
+  retval = IDAGetNumNonlinSolvIters(ida_mem, &nni);
+  check_retval(&retval, "IDAGetNumNonlinSolvIters", 1, udata->myid);
+  retval = IDAGetNumNonlinSolvConvFails(ida_mem, &ncnf);
+  check_retval(&retval, "IDAGetNumNonlinSolvConvFails", 1, udata->myid);
+  if (uopt->nls == "newton")
+  {
+    retval = IDAGetNumLinIters(ida_mem, &nli);
+    check_retval(&retval, "IDAGetNumLinIters", 1, udata->myid);
+    retval = IDAGetNumPrecSolves(ida_mem, &npsol);
+    check_retval(&retval, "IDAGetNumPrecSolves", 1, udata->myid);
+  }
+
+  /* Print final statistics */
+  if (udata->myid == 0)
+  {
+    printf("\nFinal Solver Statistics (for processor 0):\n");
+    printf("   Internal solver steps = %li\n", nst);
+    printf("   Total RHS evals: %li\n", nfi + udata->nnlfi);
+    printf("   Total number of error test failures = %li\n", netf);
+    printf("   Total number of nonlinear solver convergence failures = %li\n",
+           ncnf);
+    printf("   Total number of nonlinear iterations = %li\n", nni);
+    if (uopt->nls == "newton")
+    {
+      printf("   Total number of linear iterations = %li\n", nli);
+      printf("   Total number of preconditioner solves = %li\n", npsol);
+    }
+  }
+
+  /* Clean up */
+  IDAFree(&ida_mem);
+  if (yp) N_VDestroy(yp);
+  if (NLS) SUNNonlinSolFree(NLS);
+  if (LS)  SUNLinSolFree(LS);
+
+  /* Return success */
+  return(0);
+}
diff --git a/benchmarks/advection_reaction_3D/raja/rhs3D.hpp b/benchmarks/advection_reaction_3D/raja/rhs3D.hpp
new file mode 100644
index 0000000000..1bb2b6f105
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/raja/rhs3D.hpp
@@ -0,0 +1,598 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
+ *                Daniel R. Reynolds @ SMU
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------*/
+
+#ifndef ADVECTION_REACTION_3D_RHS_HPP
+#define ADVECTION_REACTION_3D_RHS_HPP
+
+#include "advection_reaction_3D.hpp"
+
+/* --------------------------------------------------------------
+ * Right hand side (RHS) and residual functions
+ * --------------------------------------------------------------*/
+
+/* Compute the advection term f(t,y) = -c (grad * y) with 1st order upwind
+   finite differences and store it in ydot. Returns 0, or -1 on failure. */
+static int Advection(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  /* access problem data */
+  UserData* udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set variable shortcuts */
+  const int      nxl = udata->grid->nxl;
+  const int      nyl = udata->grid->nyl;
+  const int      nzl = udata->grid->nzl;
+  const int      dof = udata->grid->dof;
+  const realtype c   = udata->c;
+  const realtype cx  = -c / udata->grid->dx; /* -c is folded into cx, cy, cz */
+  const realtype cy  = -c / udata->grid->dy;
+  const realtype cz  = -c / udata->grid->dz;
+
+  /* local variables */
+  int retval;
+
+  /* fill send buffers and begin exchanging boundary information */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = FillSendBuffers(y, udata);
+  if (check_retval(&retval, "FillSendBuffers", 1, udata->myid))
+    return(-1);
+  retval = udata->grid->ExchangeStart();
+  if (check_retval(&retval, "ExchangeStart", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+  /* set output to zero (it stays zero when c == 0: no branch below runs) */
+  N_VConst(0.0, ydot);
+
+  /* create views of the state and RHS vectors */
+  RAJA::View<realtype, RAJA::Layout<4> > Yview(GetVecData(y), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4> > dYview(GetVecData(ydot), nxl, nyl, nzl, dof);
+
+  /* iterate over domain interior (needs no neighbor data), computing advection */
+  if (c > 0.0)
+  {
+    /* flow moving in the positive x,y,z direction */
+    auto range = RAJA::make_tuple(RAJA::RangeSegment(1, nxl),
+                                  RAJA::RangeSegment(1, nyl),
+                                  RAJA::RangeSegment(1, nzl));
+    RAJA::kernel<XYZ_KERNEL_POL>(range,
+      [=] DEVICE_FUNC (int i, int j, int k) {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0) =  cz * (u_ijk - Yview(i,j,k-1,0)); // du/dz
+      dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j-1,k,0)); // du/dy
+      dYview(i,j,k,0) += cx * (u_ijk - Yview(i-1,j,k,0)); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1) =  cz * (v_ijk - Yview(i,j,k-1,1)); // dv/dz
+      dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j-1,k,1)); // dv/dy
+      dYview(i,j,k,1) += cx * (v_ijk - Yview(i-1,j,k,1)); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2) =  cz * (w_ijk - Yview(i,j,k-1,2)); // dw/dz
+      dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j-1,k,2)); // dw/dy
+      dYview(i,j,k,2) += cx * (w_ijk - Yview(i-1,j,k,2)); // dw/dx
+    });
+
+  }
+  else if (c < 0.0)
+  {
+    /* flow moving in the negative x,y,z direction */
+    auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1),
+                                  RAJA::RangeSegment(0, nyl-1),
+                                  RAJA::RangeSegment(0, nzl-1));
+    RAJA::kernel<XYZ_KERNEL_POL>(range,
+      [=] DEVICE_FUNC (int i, int j, int k) {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0) =  cz * (Yview(i,j,k+1,0) - u_ijk); // du/dz
+      dYview(i,j,k,0) += cy * (Yview(i,j+1,k,0) - u_ijk); // du/dy
+      dYview(i,j,k,0) += cx * (Yview(i+1,j,k,0) - u_ijk); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1) =  cz * (Yview(i,j,k+1,1) - v_ijk); // dv/dz
+      dYview(i,j,k,1) += cy * (Yview(i,j+1,k,1) - v_ijk); // dv/dy
+      dYview(i,j,k,1) += cx * (Yview(i+1,j,k,1) - v_ijk); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2) =  cz * (Yview(i,j,k+1,2) - w_ijk); // dw/dz
+      dYview(i,j,k,2) += cy * (Yview(i,j+1,k,2) - w_ijk); // dw/dy
+      dYview(i,j,k,2) += cx * (Yview(i+1,j,k,2) - w_ijk); // dw/dx
+    });
+
+  }
+
+  /* finish exchanging boundary information */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = udata->grid->ExchangeEnd();
+  if (check_retval(&retval, "ExchangeEnd", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+
+  /* compute advection on the upwind faces using the received neighbor data */
+  if (c > 0.0)
+  {
+    /* Flow moving in the positive x,y,z direction:
+    *  boundaries are west face, south face, back face */
+
+    /*   "Lower" faces; edge/corner points are written by more than one kernel */
+    RAJA::View<realtype, RAJA::Layout<3>>
+      Wrecv(udata->grid->getRecvBuffer("WEST"),  nyl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3>>
+      Srecv(udata->grid->getRecvBuffer("SOUTH"), nxl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3>>
+      Brecv(udata->grid->getRecvBuffer("BACK"),  nxl, nyl, dof);
+
+    auto west_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, nzl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(west_face,
+      [=] DEVICE_FUNC (int j, int k, int l) {
+      const int i = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,k,l);
+      const realtype YBack  = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - Wrecv(j,k,l)); // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth);       // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack);        // d/dz
+    });
+
+    auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nzl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(south_face,
+      [=] DEVICE_FUNC (int i, int k, int l) {
+      const int j = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YWest = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(j,k,l);
+      const realtype YBack = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - YWest);        // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - Srecv(i,k,l)); // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack);        // d/dz
+    });
+
+    auto back_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                      RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(back_face,
+      [=] DEVICE_FUNC (int i, int j, int l) {
+      const int k = 0;
+      const realtype Yijkl  = Yview(i,j,k,l);
+      const realtype YWest  = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,k,l);
+      dYview(i,j,k,l)  = cx * (Yijkl - YWest);        // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth);       // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - Brecv(i,j,l)); // d/dz
+    });
+
+  }
+  else if (c < 0.0)
+  {
+
+    /* Flow moving in the negative x,y,z direction:
+    *  boundaries are east face, north face, and front face */
+
+    /*   "Upper" faces; edge/corner points are written by more than one kernel */
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Erecv(udata->grid->getRecvBuffer("EAST"),  nyl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Nrecv(udata->grid->getRecvBuffer("NORTH"), nxl, nzl, dof);
+    RAJA::View<realtype, RAJA::Layout<3> >
+      Frecv(udata->grid->getRecvBuffer("FRONT"), nxl, nyl, dof);
+
+    auto east_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
+                                      RAJA::RangeSegment(0, nzl),
+                                      RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(east_face,
+      [=] DEVICE_FUNC (int j, int k, int l) {
+      const int i = nxl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,l);
+      dYview(i,j,k,l)  = cx * (Erecv(j,k,l) - Yijkl); // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl);       // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl);       // d/dz
+    });
+
+    auto north_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nzl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(north_face,
+      [=] DEVICE_FUNC (int i, int k, int l) {
+      const int j = nyl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast  = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(j,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,l);
+      dYview(i,j,k,l)  = cx * (YEast - Yijkl);        // d/dx
+      dYview(i,j,k,l) += cy * (Nrecv(i,k,l) - Yijkl); // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl);       // d/dz
+    });
+
+    auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                       RAJA::RangeSegment(0, nyl),
+                                       RAJA::RangeSegment(0, dof));
+    RAJA::kernel<XYZ_KERNEL_POL>(front_face,
+      [=] DEVICE_FUNC (int i, int j, int l) {
+      const int k = nzl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast  = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,k,l);
+      dYview(i,j,k,l)  = cx * (YEast - Yijkl);        // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl);       // d/dy
+      dYview(i,j,k,l) += cz * (Frecv(i,j,l) - Yijkl); // d/dz
+    });
+  }
+
+  /* return success */
+  return(0);
+}
+
+
+/* Compute the reaction term g(t,y) for the three components (u, v, w). The
+   result is added to ydot when udata->add_reactions is set; otherwise ydot is
+   zeroed first so that it holds g alone. Returns 0, or -1 on failure. */
+static int Reaction(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  /* access problem data */
+  UserData* udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set variable shortcuts */
+  const realtype A  = udata->A;
+  const realtype B  = udata->B;
+  const realtype k1 = udata->k1;
+  const realtype k2 = udata->k2;
+  const realtype k3 = udata->k3;
+  const realtype k4 = udata->k4;
+  const realtype k5 = udata->k5;
+  const realtype k6 = udata->k6;
+  const int     nxl = udata->grid->nxl;
+  const int     nyl = udata->grid->nyl;
+  const int     nzl = udata->grid->nzl;
+  const int     dof = udata->grid->dof;
+
+  /* Zero output if not adding reactions to existing RHS */
+  if (!udata->add_reactions)
+    N_VConst(0.0, ydot);
+
+  /* access data arrays and bail out if check_retval flags a pointer */
+  realtype* Ydata = GetVecData(y);
+  if (check_retval((void *)Ydata, "GetVecData", 0, udata->myid))
+    return(-1);
+  realtype* dYdata = GetVecData(ydot);
+  if (check_retval((void *)dYdata, "GetVecData", 0, udata->myid))
+    return(-1);
+
+  /* create 4D views from the pointers checked above (no second lookup) */
+  RAJA::View<realtype, RAJA::Layout<4> > Yview(Ydata, nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4> > dYview(dYdata, nxl, nyl, nzl, dof);
+
+  /* add reaction terms to RHS */
+  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                RAJA::RangeSegment(0, nyl),
+                                RAJA::RangeSegment(0, nzl));
+  RAJA::kernel<XYZ_KERNEL_POL>(range,
+    [=] DEVICE_FUNC (int i, int j, int k) {
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+    dYview(i,j,k,0) += k1 * A - k2 * w * u + k3 * u * u * v - k4 * u;
+    dYview(i,j,k,1) += k2 * w * u - k3 * u * u * v;
+    dYview(i,j,k,2) += -k2 * w * u + k5 * B - k6 * w;
+  });
+
+  /* return success */
+  return(0);
+}
+
+
+/* Compute the full RHS h(t,y) = f(t,y) + g(t,y) (advection + reaction) in ydot. */
+static int AdvectionReaction(realtype t, N_Vector y, N_Vector ydot,
+                             void *user_data)
+{
+  UserData* const problem = static_cast<UserData*>(user_data);
+
+  /* Advection() goes first: it resets ydot before writing the advection
+     values, so running it second would discard the reaction term. */
+  int flag = Advection(t, y, ydot, user_data);
+  if (check_retval(&flag, "Advection", 1, problem->myid))
+  {
+    return -1;
+  }
+
+  /* Reaction() adds g to ydot, provided udata->add_reactions is set */
+  flag = Reaction(t, y, ydot, user_data);
+  const int failed = check_retval(&flag, "Reaction", 1, problem->myid);
+  return failed ? -1 : 0;
+}
+
+/* Compute the residual F(t,y,y') = ydot - h(t,y), which is zero at a solution. */
+static int AdvectionReactionResidual(realtype t, N_Vector y, N_Vector ydot,
+                                     N_Vector F, void *user_data)
+{
+  UserData* const problem = static_cast<UserData*>(user_data);
+
+  /* assemble h(t,y) in F; Advection() must run first since it overwrites F */
+  int flag = Advection(t, y, F, user_data); /* F = -c y_x */
+  if (check_retval(&flag, "Advection", 1, problem->myid))
+  {
+    return -1;
+  }
+
+  flag = Reaction(t, y, F, user_data); /* F = -c y_x + g(t,y) */
+  if (check_retval(&flag, "Reaction", 1, problem->myid))
+  {
+    return -1;
+  }
+
+  N_VLinearSum(1.0, ydot, -1.0, F, F); /* F = ydot + c y_x - g(t,y) */
+  return 0;
+}
+
+/* --------------------------------------------------------------
+ * Linear system and Jacobian functions
+ * --------------------------------------------------------------*/
+
+/* Solve the pointwise 3x3 linear systems Ax = b where A = I - gamma*dg/dy is
+   evaluated at y, one system per grid point. With a fully implicit method
+   dh/dy is approximated by dg/dy. x receives the solution; returns 0. */
+static int SolveReactionLinSys(N_Vector y, N_Vector x, N_Vector b,
+                               realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2 = udata->k2;
+  const realtype k3 = udata->k3;
+  const realtype k4 = udata->k4;
+  const realtype k6 = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  RAJA::View<realtype, RAJA::Layout<4>> Yview(GetVecData(y), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4>> Bview(GetVecData(b), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4>> Xview(GetVecData(x), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system */
+  auto blocks = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                 RAJA::RangeSegment(0, nyl),
+                                 RAJA::RangeSegment(0, nzl));
+  RAJA::kernel<XYZ_KERNEL_POL>(blocks,
+    [=] DEVICE_FUNC (int i, int j, int k) {
+
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    //
+    // compute J = dg/dy
+    //
+
+    /* 1st row: u, v, w */
+    realtype A0 = -k2 * w + 2.0 * k3 * u * v - k4;
+    realtype A1 =  k3 * u * u;
+    realtype A2 = -k2 * u;
+
+    /* 2nd row: u, v, w */
+    realtype A3 =  k2 * w - 2.0 * k3 * u * v;
+    realtype A4 = -k3 * u * u;
+    realtype A5 =  k2 * u;
+
+    /* 3rd row: u, v, w */
+    realtype A6 = -k2 * w;
+    realtype A7 =  0.0;
+    realtype A8 = -k2 * u - k6;
+
+    //
+    // compute A = I - gamma*J
+    //
+
+    A0 = 1. - (gamma * A0);
+    A1 = -gamma * A1;
+    A2 = -gamma * A2;
+    A3 = -gamma * A3;
+    A4 = 1. - (gamma * A4);
+    A5 = -gamma * A5;
+    A6 = -gamma * A6;
+    A7 = -gamma * A7;
+    A8 = 1. - (gamma * A8);
+
+    //
+    // compute x = A^{-1}b = adj(A) b / det(A); every component uses the
+    // adjugate so that the only division is by det(A) (scratch_6 = 1/det)
+    //
+
+    realtype scratch_0 = A4*A8;
+    realtype scratch_1 = A1*A5;
+    realtype scratch_2 = A2*A7;
+    realtype scratch_3 = A5*A7;
+    realtype scratch_4 = A1*A8;
+    realtype scratch_5 = A2*A4;
+    realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
+    realtype scratch_7 = A2*A3;
+    realtype scratch_8 = A6*Bview(i,j,k,0);
+    realtype scratch_9 = A2*A6;
+    realtype scratch_10 = A3*Bview(i,j,k,0);
+
+    Xview(i,j,k,0) = scratch_6*( Bview(i,j,k,0)*(scratch_0 - scratch_3)
+                               + Bview(i,j,k,1)*(scratch_2 - scratch_4)
+                               + Bview(i,j,k,2)*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( Bview(i,j,k,2)*(scratch_7 - A0*A5)
+                               + Bview(i,j,k,1)*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    /* 3rd adjugate row; avoids dividing by A0 and by the 2x2 pivot, either
+       of which can vanish while A itself is still nonsingular */
+    Xview(i,j,k,2) = scratch_6*( Bview(i,j,k,0)*(A3*A7 - A4*A6)
+                               + Bview(i,j,k,1)*(A1*A6 - A0*A7)
+                               + Bview(i,j,k,2)*(A0*A4 - A1*A3));
+
+  });
+
+  return(0);
+}
+
+/* Solve the pointwise 3x3 linear systems Ax = b where A = -dg/dy + gamma*I at
+   y (dh/dy is approximated by dg/dy). x receives the solution; returns 0. */
+static int SolveReactionLinSysRes(N_Vector y, N_Vector x, N_Vector b,
+                                  realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2  = udata->k2;
+  const realtype k3  = udata->k3;
+  const realtype k4  = udata->k4;
+  const realtype k6  = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  RAJA::View<realtype, RAJA::Layout<4>> Yview(GetVecData(y), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4>> Bview(GetVecData(b), nxl, nyl, nzl, dof);
+  RAJA::View<realtype, RAJA::Layout<4>> Xview(GetVecData(x), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system */
+  auto blocks = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
+                                 RAJA::RangeSegment(0, nyl),
+                                 RAJA::RangeSegment(0, nzl));
+  RAJA::kernel<XYZ_KERNEL_POL>(blocks,
+    [=] DEVICE_FUNC (int i, int j, int k) {
+
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    //
+    // compute dg/dy
+    //
+
+    /* 1st row: u, v, w */
+    realtype A0 = -k2 * w + 2.0 * k3 * u * v - k4;
+    realtype A1 =  k3 * u * u;
+    realtype A2 = -k2 * u;
+
+    /* 2nd row: u, v, w */
+    realtype A3 =  k2 * w - 2.0 * k3 * u * v;
+    realtype A4 = -k3 * u * u;
+    realtype A5 =  k2 * u;
+
+    /* 3rd row: u, v, w */
+    realtype A6 = -k2 * w;
+    realtype A7 =  0.0;
+    realtype A8 = -k2 * u - k6;
+
+    //
+    // compute A = -dg/dy + gamma*dF/d(ydot); the residual is
+    // F = ydot - h(t,y), so dF/d(ydot) = I and gamma is simply
+    // added to the diagonal entries
+    //
+
+    A0 = -A0 + gamma;
+    A1 = -A1;
+    A2 = -A2;
+    A3 = -A3;
+    A4 = -A4 + gamma;
+    A5 = -A5;
+    A6 = -A6;
+    A7 = -A7;
+    A8 = -A8 + gamma;
+
+    //
+    // compute x = A^{-1}b = adj(A) b / det(A); every component uses the
+    // adjugate so that the only division is by det(A) (scratch_6 = 1/det)
+    //
+
+    realtype scratch_0 = A4*A8;
+    realtype scratch_1 = A1*A5;
+    realtype scratch_2 = A2*A7;
+    realtype scratch_3 = A5*A7;
+    realtype scratch_4 = A1*A8;
+    realtype scratch_5 = A2*A4;
+    realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
+    realtype scratch_7 = A2*A3;
+    realtype scratch_8 = A6*Bview(i,j,k,0);
+    realtype scratch_9 = A2*A6;
+    realtype scratch_10 = A3*Bview(i,j,k,0);
+
+    Xview(i,j,k,0) = scratch_6*( Bview(i,j,k,0)*(scratch_0 - scratch_3)
+                               + Bview(i,j,k,1)*(scratch_2 - scratch_4)
+                               + Bview(i,j,k,2)*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( Bview(i,j,k,2)*(scratch_7 - A0*A5)
+                               + Bview(i,j,k,1)*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    /* 3rd adjugate row; avoids dividing by A0 and by the 2x2 pivot, either
+       of which can vanish while A itself is still nonsingular */
+    Xview(i,j,k,2) = scratch_6*( Bview(i,j,k,0)*(A3*A7 - A4*A6)
+                               + Bview(i,j,k,1)*(A1*A6 - A0*A7)
+                               + Bview(i,j,k,2)*(A0*A4 - A1*A3));
+
+  });
+
+  return(0);
+}
+
+
+/* --------------------------------------------------------------
+ * Preconditioner functions
+ * --------------------------------------------------------------*/
+
+/* Preconditioner solve Pz = r with P = I - gamma * dg/dy. The arguments t,
+   ydot, delta and lr are not used. Returns the flag from
+   SolveReactionLinSys. */
+static int PSolve(realtype t, N_Vector y, N_Vector ydot, N_Vector r,
+                  N_Vector z, realtype gamma, realtype delta, int lr,
+                  void *user_data)
+{
+  UserData* const problem = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(problem->prof);
+
+  /* Pz = r is task-local: hand it to the pointwise reaction solver */
+  const int flag = SolveReactionLinSys(y, z, r, gamma, problem);
+
+  return flag;
+}
+
+/* Preconditioner solve Pz = r with P = -dg/dy + cj*I (cj plays the role of
+   gamma). The arguments t, ydot, F and delta are not used. Returns the flag
+   from SolveReactionLinSysRes. */
+static int PSolveRes(realtype t, N_Vector y, N_Vector ydot, N_Vector F,
+                     N_Vector r, N_Vector z, realtype cj, realtype delta,
+                     void *user_data)
+{
+  UserData* const problem = static_cast<UserData*>(user_data);
+
+  SUNDIALS_CXX_MARK_FUNCTION(problem->prof);
+
+  /* Pz = r is task-local: hand it to the pointwise residual solver */
+  const int flag = SolveReactionLinSysRes(y, z, r, cj, problem);
+
+  return flag;
+}
+
+
+#endif
diff --git a/benchmarks/advection_reaction_3D/rhs3D.hpp b/benchmarks/advection_reaction_3D/rhs3D.hpp
deleted file mode 100644
index 874e5cb8bb..0000000000
--- a/benchmarks/advection_reaction_3D/rhs3D.hpp
+++ /dev/null
@@ -1,700 +0,0 @@
-/* -----------------------------------------------------------------------------
- * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL
- * -----------------------------------------------------------------------------
- * SUNDIALS Copyright Start
- * Copyright (c) 2002-2023, Lawrence Livermore National Security
- * and Southern Methodist University.
- * All rights reserved.
- *
- * See the top-level LICENSE and NOTICE files for details.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- * SUNDIALS Copyright End
- * -----------------------------------------------------------------------------*/
-
-#ifndef ADVECTION_REACTION_3D_RHS_HPP
-#define ADVECTION_REACTION_3D_RHS_HPP
-
-#include "advection_reaction_3D.hpp"
-
-using raja_xyz_tuple = camp::tuple<RAJA::RangeSegment, RAJA::RangeSegment, RAJA::RangeSegment>;
-
-/* --------------------------------------------------------------
- * Right hand side (RHS) and residual functions
- * --------------------------------------------------------------*/
-
-/* Compute the advection term f(t,y) = -c (grad * y). This is done using
-   upwind 1st order finite differences. */
-static int Advection(realtype t, N_Vector y, N_Vector ydot, void* user_data)
-{
-  /* access problem data */
-  UserData* udata = (UserData*) user_data;
-
-  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-
-  /* set variable shortcuts */
-  const int      nxl = udata->grid->nxl;
-  const int      nyl = udata->grid->nyl;
-  const int      nzl = udata->grid->nzl;
-  const int      dof = udata->grid->dof;
-  const realtype c   = udata->c;
-  const realtype cx  = -c / udata->grid->dx;
-  const realtype cy  = -c / udata->grid->dy;
-  const realtype cz  = -c / udata->grid->dz;
-
-  /* local variables */
-  int retval;
-
-  /* begin exchanging boundary information */
-  if (udata->grid->nprocs() > 1)
-  {
-    retval = ExchangeAllStart(y, udata);
-    if (check_retval(&retval, "ExchangeAllStart", 1, udata->myid))
-      return(-1);
-  }
-
-  /* set output to zero */
-  N_VConst(0.0, ydot);
-
-  /* create views of the data */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > dYview(GetVecData(ydot),
-                                                      nxl, nyl, nzl, dof);
-
-  /* iterate over domain interior, computing advection */
-  if (c > 0.0)
-  {
-    /* flow moving in the positive x,y,z direction */
-    auto range = RAJA::make_tuple(RAJA::RangeSegment(1, nxl),
-                                  RAJA::RangeSegment(1, nyl),
-                                  RAJA::RangeSegment(1, nzl));
-
-    RAJA::kernel<XYZ_KERNEL_POL>(range,
-      [=] DEVICE_FUNC (int i, int j, int k) {
-      const realtype u_ijk = Yview(i,j,k,0);
-      const realtype v_ijk = Yview(i,j,k,1);
-      const realtype w_ijk = Yview(i,j,k,2);
-
-      // grad * u
-      dYview(i,j,k,0) =  cz * (u_ijk - Yview(i,j,k-1,0)); // du/dz
-      dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j-1,k,0)); // du/dy
-      dYview(i,j,k,0) += cx * (u_ijk - Yview(i-1,j,k,0)); // du/dx
-
-      // grad * v
-      dYview(i,j,k,1) =  cz * (v_ijk - Yview(i,j,k-1,1)); // dv/dz
-      dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j-1,k,1)); // dv/dy
-      dYview(i,j,k,1) += cx * (v_ijk - Yview(i-1,j,k,1)); // dv/dx
-
-      // grad * w
-      dYview(i,j,k,2) =  cz * (w_ijk - Yview(i,j,k-1,2)); // dw/dz
-      dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j-1,k,2)); // dw/dy
-      dYview(i,j,k,2) += cx * (w_ijk - Yview(i-1,j,k,2)); // dw/dx
-    });
-  }
-  else if (c < 0.0)
-  {
-    /* flow moving in the negative x,y,z direction */
-    auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1),
-                                  RAJA::RangeSegment(0, nyl-1),
-                                  RAJA::RangeSegment(0, nzl-1));
-    RAJA::kernel<XYZ_KERNEL_POL>(range,
-      [=] DEVICE_FUNC (int i, int j, int k) {
-      const realtype u_ijk = Yview(i,j,k,0);
-      const realtype v_ijk = Yview(i,j,k,1);
-      const realtype w_ijk = Yview(i,j,k,2);
-
-      // grad * u
-      dYview(i,j,k,0) =  cz * (u_ijk - Yview(i,j,k+1,0)); // du/dz
-      dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j+1,k,0)); // du/dy
-      dYview(i,j,k,0) += cx * (u_ijk - Yview(i+1,j,k,0)); // du/dx
-
-      // grad * v
-      dYview(i,j,k,1) =  cz * (v_ijk - Yview(i,j,k+1,1)); // dv/dz
-      dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j+1,k,1)); // dv/dy
-      dYview(i,j,k,1) += cx * (v_ijk - Yview(i+1,j,k,1)); // dv/dx
-
-      // grad * w
-      dYview(i,j,k,2) =  cz * (w_ijk - Yview(i,j,k+1,2)); // dw/dz
-      dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j+1,k,2)); // dw/dy
-      dYview(i,j,k,2) += cx * (w_ijk - Yview(i+1,j,k,2)); // dw/dx
-    });
-  }
-
-  /* finish exchanging boundary information */
-  if (udata->grid->nprocs() > 1)
-  {
-    retval = ExchangeAllEnd(udata);
-    if (check_retval(&retval, "ExchangeAllEnd", 1, udata->myid))
-      return(-1);
-  }
-
-  /* compute advection at process boundaries */
-  if (c > 0.0)
-  {
-    if (udata->grid->npx > 1)
-    {
-      /* Flow moving in the positive x,y,z direction:
-      *  boundaries are west face, south face, front face */
-
-      RAJA::View<realtype, RAJA::Layout<NDIMS> >
-        Yim1jk(udata->grid->getRecvBuffer("WEST"), nyl, nzl, dof); // Wrecv should have data that was sent from East
-
-      auto west_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl),
-                                        RAJA::RangeSegment(0, nzl),
-                                        RAJA::RangeSegment(0, dof));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(west_face,
-        [=] DEVICE_FUNC (int j, int k, int l) {
-        dYview(0,j,k,l) += cx * (Yview(0,j,k,l) - Yim1jk(j,k,l)); // d/dx
-      });
-    }
-    else
-    {
-      auto range = RAJA::make_tuple(RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(range,
-        [=] DEVICE_FUNC (int i, int j, int k) {
-        const realtype u_ijk = Yview(i,j,k,0);
-        const realtype v_ijk = Yview(i,j,k,1);
-        const realtype w_ijk = Yview(i,j,k,2);
-
-        dYview(i,j,k,0) = cx * (u_ijk - Yview(nxl-1,j,k,0)); // du/dx
-        dYview(i,j,k,1) = cx * (v_ijk - Yview(nxl-1,j,k,1)); // dv/dx
-        dYview(i,j,k,2) = cx * (w_ijk - Yview(nxl-1,j,k,2)); // dw/dx
-      });
-
-    }
-
-    if (udata->grid->npy > 1)
-    {
-      RAJA::View<realtype, RAJA::Layout<NDIMS> >
-        Yijm1k(udata->grid->getRecvBuffer("SOUTH"), nxl, nzl, dof); // Nrecv should have data that was sent from North
-
-      auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
-                                         RAJA::RangeSegment(0, nzl),
-                                         RAJA::RangeSegment(0, dof));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(south_face,
-        [=] DEVICE_FUNC (int i, int k, int l) {
-        dYview(i,0,k,l) += cy * (Yview(i,0,k,l) - Yijm1k(i,k,l)); // d/dy
-      });
-    }
-    else
-    {
-      auto range = RAJA::make_tuple(RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(range,
-        [=] DEVICE_FUNC (int i, int j, int k) {
-        const realtype u_ijk = Yview(i,j,k,0);
-        const realtype v_ijk = Yview(i,j,k,1);
-        const realtype w_ijk = Yview(i,j,k,2);
-
-        dYview(i,j,k,0) += cy * (u_ijk - Yview(i,nyl-1,k,0)); // du/dy
-        dYview(i,j,k,1) += cy * (v_ijk - Yview(i,nyl-1,k,1)); // dv/dy
-        dYview(i,j,k,2) += cy * (w_ijk - Yview(i,nyl-1,k,2)); // dw/dy
-      });
-    }
-
-    if (udata->grid->npz > 1)
-    {
-      RAJA::View<realtype, RAJA::Layout<NDIMS> >
-        Yijkm1(udata->grid->getRecvBuffer("FRONT"), nxl, nyl, dof); // Frecv should have data that was sent from Back
-
-      auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl),
-                                         RAJA::RangeSegment(0, nyl),
-                                         RAJA::RangeSegment(0, dof));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(front_face,
-        [=] DEVICE_FUNC (int i, int j, int l) {
-        dYview(i,j,0,l) += cz * (Yview(i,j,0,l) - Yijkm1(i,j,l)); // d/dz
-      });
-
-    }
-    else
-    {
-      auto range = RAJA::make_tuple(RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1),
-                                    RAJA::RangeSegment(0, 1));
-
-      RAJA::kernel<XYZ_KERNEL_POL>(range,
-        [=] DEVICE_FUNC (int i, int j, int k) {
-        const realtype u_ijk = Yview(i,j,k,0);
-        const realtype v_ijk = Yview(i,j,k,1);
-        const realtype w_ijk = Yview(i,j,k,2);
-
-        dYview(i,j,k,0) +=  cz * (u_ijk - Yview(i,j,nzl-1,0)); // du/dz
-        dYview(i,j,k,1) +=  cz * (v_ijk - Yview(i,j,nzl-1,1)); // dv/dz
-        dYview(i,j,k,2) +=  cz * (w_ijk - Yview(i,j,nzl-1,2)); // dw/dz
-      });
-    }
-  }
-  else if (c < 0.0)
-  {
-    if (udata->grid->nprocs() != 1)
-    {
-      /* Flow moving in the negative x,y,z direction:
-      *  boundaries are west face, south face, and front face */
-
-      RAJA::View<realtype, RAJA::Layout<3> >
-        Yip1jk(udata->grid->getRecvBuffer("EAST"), nyl, nzl, dof);
-      RAJA::View<realtype, RAJA::Layout<3> >
-        Yijp1k(udata->grid->getRecvBuffer("NORTH"), nxl, nzl, dof);
-      RAJA::View<realtype, RAJA::Layout<3> >
-        Yijkp1(udata->grid->getRecvBuffer("BACK"), nxl, nyl, dof);
-
-      auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1),
-                                         RAJA::RangeSegment(0, nyl-1),
-                                         RAJA::RangeSegment(0, dof));
-      RAJA::kernel<XYZ_KERNEL_POL>(front_face,
-        [=] DEVICE_FUNC (int i, int j, int l) {
-        dYview(i,j,0,l) =  cz * (Yview(i,j,0,l) - Yijkp1(i,nzl+1,l)); // d/dz
-        dYview(i,j,0,l) += cy * (Yview(i,j,0,l) - Yijp1k(0,j+1,l));   // d/dy
-        dYview(i,j,0,l) += cx * (Yview(i,j,0,l) - Yip1jk(i+1,0,l));   // d/dx
-      });
-
-      auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1),
-                                         RAJA::RangeSegment(0, nzl-1),
-                                         RAJA::RangeSegment(0, dof));
-      RAJA::kernel<XYZ_KERNEL_POL>(south_face,
-        [=] DEVICE_FUNC (int i, int k, int l) {
-        dYview(i,0,k,l) =  cz * (Yview(i,0,k,l) - Yijkp1(i,k+1,l));   // d/dz
-        dYview(i,0,k,l) += cy * (Yview(i,0,k,l) - Yijp1k(0,nyl+1,l)); // d/dy
-        dYview(i,0,k,l) += cx * (Yview(i,0,k,l) - Yip1jk(i+1,0,l));   // d/dx
-      });
-
-      auto east_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl-1),
-                                        RAJA::RangeSegment(0, nzl-1),
-                                        RAJA::RangeSegment(0, dof));
-      RAJA::kernel<XYZ_KERNEL_POL>(east_face,
-        [=] DEVICE_FUNC (int j, int k, int l) {
-        dYview(0,j,k,l) =  cz * (Yview(0,j,k,l) - Yijkp1(0,k+1,l));   // d/dz
-        dYview(0,j,k,l) += cy * (Yview(0,j,k,l) - Yijp1k(0,j+1,l));   // d/dy
-        dYview(0,j,k,l) += cx * (Yview(0,j,k,l) - Yip1jk(nxl+1,0,l)); // d/dx
-      });
-    }
-    else
-    {
-      auto range = RAJA::make_tuple(RAJA::RangeSegment(nxl-2, nxl),
-                                    RAJA::RangeSegment(nyl-2, nyl),
-                                    RAJA::RangeSegment(nzl-2, nzl));
-      RAJA::kernel<XYZ_KERNEL_POL>(range,
-        [=] DEVICE_FUNC (int i, int j, int k) {
-        const realtype u_ijk = Yview(i,j,k,0);
-        const realtype v_ijk = Yview(i,j,k,1);
-        const realtype w_ijk = Yview(i,j,k,2);
-
-        // grad * u
-        dYview(i,j,k,0) =  cz * (u_ijk - Yview(i,j,0,0)); // du/dz
-        dYview(i,j,k,0) += cy * (u_ijk - Yview(i,0,k,0)); // du/dy
-        dYview(i,j,k,0) += cx * (u_ijk - Yview(0,j,k,0)); // du/dx
-
-        // grad * v
-        dYview(i,j,k,1) =  cz * (v_ijk - Yview(i,j,0,1)); // dv/dz
-        dYview(i,j,k,1) += cy * (v_ijk - Yview(i,0,k,1)); // dv/dy
-        dYview(i,j,k,1) += cx * (v_ijk - Yview(0,j,k,1)); // dv/dx
-
-        // grad * w
-        dYview(i,j,k,2) =  cz * (w_ijk - Yview(i,j,0,2)); // dw/dz
-        dYview(i,j,k,2) += cy * (w_ijk - Yview(i,0,k,2)); // dw/dy
-        dYview(i,j,k,2) += cx * (w_ijk - Yview(0,j,k,2)); // dw/dx
-      });
-    }
-  }
-
-  /* return success */
-  return(0);
-}
-
-
-/* Compute the reaction term g(t,y). */
-static int Reaction(realtype t, N_Vector y, N_Vector ydot, void* user_data)
-{
-  /* access problem data */
-  UserData* udata = (UserData*) user_data;
-
-  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-
-  /* set variable shortcuts */
-  const realtype A  = udata->A;
-  const realtype B  = udata->B;
-  const realtype k1 = udata->k1;
-  const realtype k2 = udata->k2;
-  const realtype k3 = udata->k3;
-  const realtype k4 = udata->k4;
-  const realtype k5 = udata->k5;
-  const realtype k6 = udata->k6;
-
-  /* local variables */
-  realtype* Ydata  = NULL;
-  realtype* dYdata = NULL;
-
-  /* access data arrays */
-  Ydata = GetVecData(y);
-  if (check_retval((void *)Ydata, "GetVecData", 0, udata->myid))
-    return(-1);
-
-  dYdata = GetVecData(ydot);
-  if (check_retval((void *)dYdata, "GetVecData", 0, udata->myid))
-    return(-1);
-
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     udata->grid->nxl,
-                                                     udata->grid->nyl,
-                                                     udata->grid->nzl,
-                                                     udata->grid->dof);
-
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > dYview(GetVecData(ydot),
-                                                      udata->grid->nxl,
-                                                      udata->grid->nyl,
-                                                      udata->grid->nzl,
-                                                      udata->grid->dof);
-
-  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                RAJA::RangeSegment(0, udata->grid->nyl),
-                                RAJA::RangeSegment(0, udata->grid->nzl));
-
-  /* iterate over domain, computing reactions */
-  if (udata->add_reactions)
-  {
-    /* when we are not additively splitting the rhs, we add to ydot
-       as we expect it to hold the advection term already */
-    RAJA::kernel<XYZ_KERNEL_POL>(range,
-      [=] DEVICE_FUNC (int i, int j, int k) {
-      const realtype u = Yview(i,j,k,0);
-      const realtype v = Yview(i,j,k,1);
-      const realtype w = Yview(i,j,k,2);
-      dYview(i,j,k,0) += k1 * A - k2 * w * u + k3 * u * u * v - k4 * u;
-      dYview(i,j,k,1) += k2 * w * u - k3 * u * u * v;
-      dYview(i,j,k,2) += -k2 * w * u + k5 * B - k6 * w;
-    });
-  }
-  else
-  {
-    /* set output to zero */
-    N_VConst(0.0, ydot);
-
-    RAJA::kernel<XYZ_KERNEL_POL>(range,
-      [=] DEVICE_FUNC (int i, int j, int k) {
-      const realtype u = Yview(i,j,k,0);
-      const realtype v = Yview(i,j,k,1);
-      const realtype w = Yview(i,j,k,2);
-      dYview(i,j,k,0) = k1 * A - k2 * w * u + k3 * u * u * v - k4 * u;
-      dYview(i,j,k,1) = k2 * w * u - k3 * u * u * v;
-      dYview(i,j,k,2) = -k2 * w * u + k5 * B - k6 * w;
-    });
-  }
-
-  /* return success */
-    return(0);
-}
-
-
-/* Compute the RHS as h(t,y) = f(t,y) + g(t,y). */
-static int AdvectionReaction(realtype t, N_Vector y, N_Vector ydot,
-                             void *user_data)
-{
-  /* access problem data */
-  UserData* udata = (UserData*) user_data;
-  int retval;
-
-  /* NOTE: The order in which Advection and Reaction are
-           called is critical here. Advection must be
-           computed first. */
-  retval = Advection(t, y, ydot, user_data);
-  if (check_retval((void *)&retval, "Advection", 1, udata->myid)) return(-1);
-
-  retval = Reaction(t, y, ydot, user_data);
-  if (check_retval((void *)&retval, "Reaction", 1, udata->myid)) return(-1);
-
-  /* return success */
-  return(0);
-}
-
-/* Compute the residual F(t,y,y') = ydot - h(t,y) = 0. */
-static int AdvectionReactionResidual(realtype t, N_Vector y, N_Vector ydot,
-                                     N_Vector F, void *user_data)
-{
-  /* access problem data */
-  UserData* udata = (UserData*) user_data;
-  int retval;
-
-  /* NOTE: The order in which Advection and Reaction are
-           called is critical here. Advection must be
-           computed first. */
-  retval = Advection(t, y, F, user_data); /* F = -c y_x */
-  if (check_retval((void *)&retval, "Advection", 1, udata->myid)) return(-1);
-
-  retval = Reaction(t, y, F, user_data);  /* F = -c y_x + g(t,y) */
-  if (check_retval((void *)&retval, "Reaction", 1, udata->myid)) return(-1);
-
-  /* F = ydot - h(t,y) = ydot + c y_x - g(t,y) */
-  N_VLinearSum(1.0, ydot, -1.0, F, F);
-
-  /* return success */
-  return(0);
-}
-
-/* --------------------------------------------------------------
- * Linear system and Jacobian functions
- * --------------------------------------------------------------*/
-
-/* Solve the linear systems Ax = b where A = I - gamma*dg/dy.
-   When using a fully implicit method, we are approximating
-   dh/dy as dg/dy. */
-static int SolveReactionLinSys(N_Vector y, N_Vector x, N_Vector b,
-                               realtype gamma, raja_xyz_tuple blocks,
-                               UserData* udata)
-{
-  /* shortcuts */
-  int       dof, nxl, nyl, nzl;
-  realtype  k2, k3, k4, k6;
-
-  /* set shortcuts */
-  dof = udata->grid->dof;
-  nxl = udata->grid->nxl;
-  nyl = udata->grid->nyl;
-  nzl = udata->grid->nzl;
-  k2  = udata->k2;
-  k3  = udata->k3;
-  k4  = udata->k4;
-  k6  = udata->k6;
-  
-  /* create views of the data */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Bview(GetVecData(b),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Xview(GetVecData(x),
-                                                     nxl, nyl, nzl, dof);
-
-  RAJA::kernel<XYZ_KERNEL_POL>(blocks,
-    [=] DEVICE_FUNC (int i, int j, int k) {
-
-    /* and the corresponding vectors */
-    realtype *b = &(Bview(i,j,k,0));
-    realtype *x = &(Xview(i,j,k,0));
-
-    /* shortcuts to u, v, w for the block */
-    realtype u = Yview(i,j,k,0);
-    realtype v = Yview(i,j,k,1);
-    realtype w = Yview(i,j,k,2);
-
-    realtype A0, A1, A2, A3, A4, A5, A6, A7, A8;
-
-    //
-    // compute J = dg/dy
-    //
-
-    /* 1st row: u, v, w */
-    A0 = -k2 * w + 2.0 * k3 * u * v - k4;
-    A1 =  k3 * u * u;
-    A2 = -k2 * u;
-
-    /* 2nd row: u, v, w */
-    A3 =  k2 * w - 2.0 * k3 * u * v;
-    A4 = -k3 * u * u;
-    A5 =  k2 * u;
-
-    /* 3rd row: u, v, w */
-    A6 = -k2 * w;
-    A7 =  0.0;
-    A8 = -k2 * u - k6;
-
-    //
-    // compute A = I - gamma*J
-    //
-
-    A0 = 1. - (gamma * A0);
-    A1 = -gamma * A1;
-    A2 = -gamma * A2;
-    A3 = -gamma * A3;
-    A4 = 1. - (gamma * A4);
-    A5 = -gamma * A5;
-    A6 = -gamma * A6;
-    A7 = -gamma * A7;
-    A8 = 1. - (gamma * A8);
-
-    //
-    // compute x = A^{-1}b
-    //
-
-    realtype scratch_0 = A4*A8;
-    realtype scratch_1 = A1*A5;
-    realtype scratch_2 = A2*A7;
-    realtype scratch_3 = A5*A7;
-    realtype scratch_4 = A1*A8;
-    realtype scratch_5 = A2*A4;
-    realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
-    realtype scratch_7 = A2*A3;
-    realtype scratch_8 = A6*b[0];
-    realtype scratch_9 = A2*A6;
-    realtype scratch_10 = A3*b[0];
-    realtype scratch_11 = 1.0/A0;
-    realtype scratch_12 = A1*scratch_11;
-    realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
-
-    x[0] = scratch_6*(b[0]*scratch_0 - b[0]*scratch_3 + b[1]*scratch_2 - b[1]*scratch_4 + b[2]*scratch_1 - b[2]*scratch_5);
-    x[1] = scratch_6*(-A0*A5*b[2] + A0*A8*b[1] + A5*scratch_8 - A8*scratch_10 - b[1]*scratch_9 + b[2]*scratch_7);
-    x[2] = (-b[2] + scratch_11*scratch_8 + scratch_13*(b[1] - scratch_10*scratch_11))/(-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
-  });
-
-  return(0);
-}
-
-/* Solve the linear systems Ax = b where A = -dg/dy + gamma.
-   We are approximating dh/dy as dg/dy. */
-static int SolveReactionLinSysRes(N_Vector y, N_Vector x, N_Vector b,
-                                  realtype gamma, raja_xyz_tuple blocks,
-                                  UserData* udata)
-{
-  /* shortcuts */
-  int       dof, nxl, nyl, nzl;
-  realtype  k2, k3, k4, k6;
-
-  /* set shortcuts */
-  dof = udata->grid->dof;
-  nxl = udata->grid->nxl;
-  nyl = udata->grid->nyl;
-  nzl = udata->grid->nzl;
-  k2    = udata->k2;
-  k3    = udata->k3;
-  k4    = udata->k4;
-  k6    = udata->k6;
-
-  /* create views of the data */
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Yview(GetVecData(y),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Bview(GetVecData(b),
-                                                     nxl, nyl, nzl, dof);
-  RAJA::View<realtype, RAJA::Layout<NDIMS+1> > Xview(GetVecData(x),
-                                                     nxl, nyl, nzl, dof);
-
-  RAJA::kernel<XYZ_KERNEL_POL>(blocks,
-    [=] DEVICE_FUNC (int i, int j, int k) {
-
-    /* and the corresponding vectors */
-    realtype *b = &(Bview(i,j,k,0));
-    realtype *x = &(Xview(i,j,k,0));
-
-    /* shortcuts to u, v, w for the block */
-    realtype u = Yview(i,j,k,0);
-    realtype v = Yview(i,j,k,1);
-    realtype w = Yview(i,j,k,2);
-
-    realtype A0, A1, A2, A3, A4, A5, A6, A7, A8;
-
-    //
-    // compute dg/dy
-    //
-
-    /* 1st row: u, v, w */
-    A0 = -k2 * w + 2.0 * k3 * u * v - k4;
-    A1 =  k3 * u * u;
-    A2 = -k2 * u;
-
-    /* 2nd row: u, v, w */
-    A3 =  k2 * w - 2.0 * k3 * u * v;
-    A4 = -k3 * u * u;
-    A5 =  k2 * u;
-
-    /* 3rd row: u, v, w */
-    A6 = -k2 * w;
-    A7 =  0.0;
-    A8 = -k2 * u - k6;
-
-    //
-    // compute A = -dg/dy + gamma*diag(df/dydot)
-    // where diag(df/dydot) is approximated as
-    // diag([udot, vdot, wdot])
-    //
-
-    A0 = -A0 + gamma;
-    A1 = -A1;
-    A2 = -A2;
-    A3 = -A3;
-    A4 = -A4 + gamma;
-    A5 = -A5;
-    A6 = -A6;
-    A7 = -A7;
-    A8 = -A8 + gamma;
-
-    //
-    // compute x = A^{-1}b
-    //
-
-    realtype scratch_0 = A4*A8;
-    realtype scratch_1 = A1*A5;
-    realtype scratch_2 = A2*A7;
-    realtype scratch_3 = A5*A7;
-    realtype scratch_4 = A1*A8;
-    realtype scratch_5 = A2*A4;
-    realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
-    realtype scratch_7 = A2*A3;
-    realtype scratch_8 = A6*b[0];
-    realtype scratch_9 = A2*A6;
-    realtype scratch_10 = A3*b[0];
-    realtype scratch_11 = 1.0/A0;
-    realtype scratch_12 = A1*scratch_11;
-    realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
-
-    x[0] = scratch_6*(b[0]*scratch_0 - b[0]*scratch_3 + b[1]*scratch_2 - b[1]*scratch_4 + b[2]*scratch_1 - b[2]*scratch_5);
-    x[1] = scratch_6*(-A0*A5*b[2] + A0*A8*b[1] + A5*scratch_8 - A8*scratch_10 - b[1]*scratch_9 + b[2]*scratch_7);
-    x[2] = (-b[2] + scratch_11*scratch_8 + scratch_13*(b[1] - scratch_10*scratch_11))/(-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7));
-  });
-
-  return(0);
-}
-
-
-/* --------------------------------------------------------------
- * Preconditioner functions
- * --------------------------------------------------------------*/
-
-/* Solves Pz = r where P = I - gamma * dg/dy */
-static int PSolve(realtype t, N_Vector y, N_Vector ydot, N_Vector r,
-                  N_Vector z, realtype gamma, realtype delta, int lr,
-
-                  void *user_data)
-{
-  /* local variables */
-  UserData* udata = (UserData*) user_data;
-  int       retval;
-
-  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-
-  /* solve the task-local linear system Pz = r */
-  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                RAJA::RangeSegment(0, udata->grid->nyl),
-                                RAJA::RangeSegment(0, udata->grid->nzl));
-  retval = SolveReactionLinSys(y, z, r, gamma, range, udata);
-
-  return(retval);
-}
-
-/* Solves Pz = r where P = -dg/dy + gamma */
-static int PSolveRes(realtype t, N_Vector y, N_Vector ydot, N_Vector F,
-                     N_Vector r, N_Vector z, realtype cj, realtype delta,
-                     void *user_data)
-{
-  /* local variables */
-  UserData* udata = (UserData*) user_data;
-  int       retval;
-
-  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
-
-  /* solve the task-local linear system Pz = r */
-  auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl),
-                                RAJA::RangeSegment(0, udata->grid->nyl),
-                                RAJA::RangeSegment(0, udata->grid->nzl));
-  retval = SolveReactionLinSysRes(y, z, r, cj, range, udata);
-
-  return(retval);
-}
-
-
-#endif
diff --git a/benchmarks/advection_reaction_3D/scripts/make_plots.py b/benchmarks/advection_reaction_3D/scripts/make_plots.py
new file mode 100755
index 0000000000..7728562510
--- /dev/null
+++ b/benchmarks/advection_reaction_3D/scripts/make_plots.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python
+# ------------------------------------------------------------------------------
+# Programmer(s):  Daniel R. Reynolds @ SMU
+# ------------------------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2023, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ------------------------------------------------------------------------------
+# matplotlib-based plotting script for the advection_reaction_3D benchmark codes
+# ------------------------------------------------------------------------------
+
+# imports
+from os.path import exists
+import numpy as np
+import matplotlib.pyplot as plt
+
+# ------------------------------------------------------------------------------
+
+# utility functions
+def parallel_coords(rank):
+    """Return the [x, y, z] process-grid coordinates of an MPI rank.
+
+    Covers the eight ranks of a 2 x 2 x 2 layout (last coordinate varies
+    fastest with increasing rank); any other rank yields None.
+    """
+    layout = {
+        0: [0, 0, 0],
+        1: [0, 0, 1],
+        2: [0, 1, 0],
+        3: [0, 1, 1],
+        4: [1, 0, 0],
+        5: [1, 0, 1],
+        6: [1, 1, 0],
+        7: [1, 1, 1],
+    }
+    return layout.get(rank)
+
+def xslice(u,it,ix):  # y-z plane of u at time index it and x index ix
+    return u[it, ix]
+
+def yslice(u,it,iy):  # x-z plane of u at time index it and y index iy
+    return u[it][:, iy]
+
+def zslice(u,it,iz):  # x-y plane of u at time index it and z index iz
+    return u[it][..., iz]
+
+def xproj(u,it):  # mean of u over the x axis at time index it (y-z result)
+    return np.mean(u[it], axis=0)
+
+def yproj(u,it):  # mean of u over the y axis at time index it (x-z result)
+    return np.mean(u[it], axis=1)
+
+def zproj(u,it):  # mean of u over the z axis at time index it (x-y result)
+    return np.mean(u[it], axis=2)
+
+def myplot(axis, X, Y, Z, xlabel='none', ylabel='none'):
+    # Filled contour plot of Z over (X, Y) on `axis`, with a colorbar attached;
+    # a label equal to the string 'none' (the default) is left unset.
+    plt.colorbar(axis.contourf(X, Y, Z), ax=axis)
+    for label, setter in ((xlabel, axis.set_xlabel), (ylabel, axis.set_ylabel)):
+        if label != 'none':
+            setter(label)
+
+
+
+# read time mesh (presumably one entry per saved output time); nt is the number of entries
+times = np.loadtxt("t.000000.txt")
+nt = times.size
+
+# read spatial mesh: rows 0, 1 and 2 of mesh.txt are used as the x, y and z coordinates
+mesh = np.loadtxt("mesh.txt", dtype=float)
+x = mesh[0,:]
+y = mesh[1,:]
+z = mesh[2,:]
+nx = x.size  # number of mesh points in x, y and z
+ny = y.size
+nz = z.size
+
+# ensure that the run used exactly 1 or 8 MPI ranks (nprocs stays 0 if no rank count can be detected)
+nprocs = 0
+for i in range(9):
+    if exists("u.00000" + str(i) + ".txt") and not exists("u.00000" + str(i+1) + ".txt"):
+        nprocs = i+1
+if nprocs not in (1, 8):
+    print("make_plots.py error: run must have used either 1 or 8 MPI ranks")
+    raise SystemExit(1)  # nonzero status so callers can detect the failure
+
+# load data for run: every row of a data file is one output time, holding that file's
+# subdomain flattened in C order; u, v and w are assembled as (nt, nx, ny, nz) arrays
+u = np.zeros((nt,nx,ny,nz), dtype=float)
+v = np.zeros((nt,nx,ny,nz), dtype=float)
+w = np.zeros((nt,nx,ny,nz), dtype=float)
+if (nprocs == 1):
+    udata = np.loadtxt("u.000000.txt")
+    vdata = np.loadtxt("v.000000.txt")
+    wdata = np.loadtxt("w.000000.txt")
+    # a size mismatch is reported and ends the script with a nonzero exit status
+    for data in (udata, vdata, wdata):
+        if (data.shape != (nt, nx*ny*nz)):
+            print("make_plots.py error: mesh and data have incompatible sizes")
+            raise SystemExit(1)
+    # unflatten each row onto the full mesh
+    for it in range(nt):
+        u[it,:,:,:] = np.reshape(udata[it,:], (nx,ny,nz), order='C')
+        v[it,:,:,:] = np.reshape(vdata[it,:], (nx,ny,nz), order='C')
+        w[it,:,:,:] = np.reshape(wdata[it,:], (nx,ny,nz), order='C')
+else:
+    # 8 ranks: parallel_coords places each rank in a 2 x 2 x 2 layout, so a rank's subdomain
+    # spans half of the mesh in every direction
+    nxl = nx//2
+    nyl = ny//2
+    nzl = nz//2
+    for ip in range(8):
+        udata = np.loadtxt("u.00000" + str(ip) + ".txt")
+        vdata = np.loadtxt("v.00000" + str(ip) + ".txt")
+        wdata = np.loadtxt("w.00000" + str(ip) + ".txt")
+        for data in (udata, vdata, wdata):
+            if (data.shape != (nt, nxl*nyl*nzl)):
+                print("make_plots.py error: mesh and data have incompatible sizes")
+                raise SystemExit(1)
+        # index range of this rank's subdomain within the global arrays
+        coords = parallel_coords(ip)
+        ilo = coords[0]*nxl
+        ihi = (coords[0]+1)*nxl
+        jlo = coords[1]*nyl
+        jhi = (coords[1]+1)*nyl
+        klo = coords[2]*nzl
+        khi = (coords[2]+1)*nzl
+        # copy each row into this rank's block of the global arrays
+        for it in range(nt):
+            u[it,ilo:ihi,jlo:jhi,klo:khi] = np.reshape(udata[it,:], (nxl,nyl,nzl), order='C')
+            v[it,ilo:ihi,jlo:jhi,klo:khi] = np.reshape(vdata[it,:], (nxl,nyl,nzl), order='C')
+            w[it,ilo:ihi,jlo:jhi,klo:khi] = np.reshape(wdata[it,:], (nxl,nyl,nzl), order='C')
+
+
+# set meshgrid objects; indexing='ij' makes each grid match the (first axis, second axis) shape of the slices plotted on it
+xy0,xy1 = np.meshgrid(x, y, indexing='ij')
+yz0,yz1 = np.meshgrid(y, z, indexing='ij')
+xz0,xz1 = np.meshgrid(x, z, indexing='ij')
+
+# generate plots
+sliceidx = 25  # mesh index of the plane shown in the slice plots; assumes more than 25 points per direction
+tslice = [0, 5, 10]  # output-time indices shown in each figure; assumes at least 11 stored outputs
+figsize = (9,7)  # passed to plt.subplots as the figure size
+
+#    xy slices at various times (rows: u, v, w; columns: the output times selected by tslice)
+# plt.subplots opens the figure itself, so no separate plt.figure() call is made
+fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize)
+myplot(ax1, xy0, xy1, zslice(u,tslice[0],sliceidx), ylabel = 'u')
+myplot(ax2, xy0, xy1, zslice(u,tslice[1],sliceidx))
+myplot(ax3, xy0, xy1, zslice(u,tslice[2],sliceidx))
+myplot(ax4, xy0, xy1, zslice(v,tslice[0],sliceidx), ylabel = 'v')
+myplot(ax5, xy0, xy1, zslice(v,tslice[1],sliceidx))
+myplot(ax6, xy0, xy1, zslice(v,tslice[2],sliceidx))
+myplot(ax7, xy0, xy1, zslice(w,tslice[0],sliceidx), ylabel = 'w', xlabel = 't = ' + str(times[tslice[0]]))
+myplot(ax8, xy0, xy1, zslice(w,tslice[1],sliceidx), xlabel = 't = ' + str(times[tslice[1]]))
+myplot(ax9, xy0, xy1, zslice(w,tslice[2],sliceidx), xlabel = 't = ' + str(times[tslice[2]]))
+plt.savefig('xy-slices.png')
+
+#    yz slices at various times (rows: u, v, w; columns: the output times selected by tslice)
+# plt.subplots opens the figure itself, so no separate plt.figure() call is made
+fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize)
+myplot(ax1, yz0, yz1, xslice(u,tslice[0],sliceidx), ylabel = 'u')
+myplot(ax2, yz0, yz1, xslice(u,tslice[1],sliceidx))
+myplot(ax3, yz0, yz1, xslice(u,tslice[2],sliceidx))
+myplot(ax4, yz0, yz1, xslice(v,tslice[0],sliceidx), ylabel = 'v')
+myplot(ax5, yz0, yz1, xslice(v,tslice[1],sliceidx))
+myplot(ax6, yz0, yz1, xslice(v,tslice[2],sliceidx))
+myplot(ax7, yz0, yz1, xslice(w,tslice[0],sliceidx), ylabel = 'w', xlabel = 't = ' + str(times[tslice[0]]))
+myplot(ax8, yz0, yz1, xslice(w,tslice[1],sliceidx), xlabel = 't = ' + str(times[tslice[1]]))
+myplot(ax9, yz0, yz1, xslice(w,tslice[2],sliceidx), xlabel = 't = ' + str(times[tslice[2]]))
+plt.savefig('yz-slices.png')
+
+#    xz slices at various times (rows: u, v, w; columns: the output times selected by tslice)
+# plt.subplots opens the figure itself, so no separate plt.figure() call is made
+fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize)
+myplot(ax1, xz0, xz1, yslice(u,tslice[0],sliceidx), ylabel = 'u')
+myplot(ax2, xz0, xz1, yslice(u,tslice[1],sliceidx))
+myplot(ax3, xz0, xz1, yslice(u,tslice[2],sliceidx))
+myplot(ax4, xz0, xz1, yslice(v,tslice[0],sliceidx), ylabel = 'v')
+myplot(ax5, xz0, xz1, yslice(v,tslice[1],sliceidx))
+myplot(ax6, xz0, xz1, yslice(v,tslice[2],sliceidx))
+myplot(ax7, xz0, xz1, yslice(w,tslice[0],sliceidx), ylabel = 'w', xlabel = 't = ' + str(times[tslice[0]]))
+myplot(ax8, xz0, xz1, yslice(w,tslice[1],sliceidx), xlabel = 't = ' + str(times[tslice[1]]))
+myplot(ax9, xz0, xz1, yslice(w,tslice[2],sliceidx), xlabel = 't = ' + str(times[tslice[2]]))
+plt.savefig('xz-slices.png')
+
+#    xy projection at various times (rows: u, v, w; columns: the output times selected by tslice)
+# plt.subplots opens the figure itself, so no separate plt.figure() call is made
+fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize)
+myplot(ax1, xy0, xy1, zproj(u,tslice[0]), ylabel = 'u')
+myplot(ax2, xy0, xy1, zproj(u,tslice[1]))
+myplot(ax3, xy0, xy1, zproj(u,tslice[2]))
+myplot(ax4, xy0, xy1, zproj(v,tslice[0]), ylabel = 'v')
+myplot(ax5, xy0, xy1, zproj(v,tslice[1]))
+myplot(ax6, xy0, xy1, zproj(v,tslice[2]))
+myplot(ax7, xy0, xy1, zproj(w,tslice[0]), ylabel = 'w', xlabel = 't = ' + str(times[tslice[0]]))
+myplot(ax8, xy0, xy1, zproj(w,tslice[1]), xlabel = 't = ' + str(times[tslice[1]]))
+myplot(ax9, xy0, xy1, zproj(w,tslice[2]), xlabel = 't = ' + str(times[tslice[2]]))
+plt.savefig('xy-projections.png')
+
+#    yz projection at various times (rows: u, v, w; columns: the output times selected by tslice)
+# plt.subplots opens the figure itself, so no separate plt.figure() call is made
+fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize)
+myplot(ax1, yz0, yz1, xproj(u,tslice[0]), ylabel = 'u')
+myplot(ax2, yz0, yz1, xproj(u,tslice[1]))
+myplot(ax3, yz0, yz1, xproj(u,tslice[2]))
+myplot(ax4, yz0, yz1, xproj(v,tslice[0]), ylabel = 'v')
+myplot(ax5, yz0, yz1, xproj(v,tslice[1]))
+myplot(ax6, yz0, yz1, xproj(v,tslice[2]))
+myplot(ax7, yz0, yz1, xproj(w,tslice[0]), ylabel = 'w', xlabel = 't = ' + str(times[tslice[0]]))
+myplot(ax8, yz0, yz1, xproj(w,tslice[1]), xlabel = 't = ' + str(times[tslice[1]]))
+myplot(ax9, yz0, yz1, xproj(w,tslice[2]), xlabel = 't = ' + str(times[tslice[2]]))
+plt.savefig('yz-projections.png')
+
+#    xz projection at various times (rows: u, v, w; columns: the output times selected by tslice)
+# plt.subplots opens the figure itself, so no separate plt.figure() call is made
+fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize)
+myplot(ax1, xz0, xz1, yproj(u,tslice[0]), ylabel = 'u')
+myplot(ax2, xz0, xz1, yproj(u,tslice[1]))
+myplot(ax3, xz0, xz1, yproj(u,tslice[2]))
+myplot(ax4, xz0, xz1, yproj(v,tslice[0]), ylabel = 'v')
+myplot(ax5, xz0, xz1, yproj(v,tslice[1]))
+myplot(ax6, xz0, xz1, yproj(v,tslice[2]))
+myplot(ax7, xz0, xz1, yproj(w,tslice[0]), ylabel = 'w', xlabel = 't = ' + str(times[tslice[0]]))
+myplot(ax8, xz0, xz1, yproj(w,tslice[1]), xlabel = 't = ' + str(times[tslice[1]]))
+myplot(ax9, xz0, xz1, yproj(w,tslice[2]), xlabel = 't = ' + str(times[tslice[2]]))
+plt.savefig('xz-projections.png')
+
+# close every figure opened above (a bare plt.close() would close only the current one)
+plt.close('all')
+
+##### end of script #####
diff --git a/cmake/SundialsTPLOptions.cmake b/cmake/SundialsTPLOptions.cmake
index f01a0ac14d..11e39d0f99 100644
--- a/cmake/SundialsTPLOptions.cmake
+++ b/cmake/SundialsTPLOptions.cmake
@@ -61,6 +61,11 @@ sundials_option(ENABLE_HIP BOOL "Enable HIP support" OFF)
 # -------------------------------------------------------------
 sundials_option(ENABLE_SYCL BOOL "Enable SYCL support" OFF)
 
+sundials_option(SUNDIALS_SYCL_2020_UNSUPPORTED BOOL
+                "Disable the use of some SYCL 2020 features in SUNDIALS libraries and examples" OFF
+                DEPENDS_ON ENABLE_SYCL
+                ADVANCED)  # opt-in (default OFF); tied to ENABLE_SYCL through DEPENDS_ON
+
 # ---------------------------------------------------------------
 # Enable LAPACK support?
 # ---------------------------------------------------------------
@@ -288,6 +293,16 @@ sundials_option(ONEMKL_WORKS BOOL "Set to ON to force CMake to accept a given on
                 DEPENDS_ON ENABLE_ONEMKL
                 ADVANCED)
 
+sundials_option(SUNDIALS_ONEMKL_USE_GETRF_LOOP BOOL
+                "Replace batched getrf call with loop over getrf" OFF
+                DEPENDS_ON ENABLE_ONEMKL
+                ADVANCED)  # opt-in (default OFF); tied to ENABLE_ONEMKL through DEPENDS_ON
+
+sundials_option(SUNDIALS_ONEMKL_USE_GETRS_LOOP BOOL
+                "Replace batched getrs call with loop over getrs" OFF
+                DEPENDS_ON ENABLE_ONEMKL
+                ADVANCED)  # opt-in (default OFF); tied to ENABLE_ONEMKL through DEPENDS_ON
+
 # ---------------------------------------------------------------
 # Enable Caliper support?
 # ---------------------------------------------------------------
diff --git a/cmake/tpl/SundialsONEMKL.cmake b/cmake/tpl/SundialsONEMKL.cmake
index 34177ff0fe..a807a2e7f4 100644
--- a/cmake/tpl/SundialsONEMKL.cmake
+++ b/cmake/tpl/SundialsONEMKL.cmake
@@ -64,6 +64,7 @@ find_package(MKL CONFIG
              NO_DEFAULT_PATH
              REQUIRED)
 
+message(STATUS "MKL Version: ${MKL_VERSION}")  # NOTE(review): MKL_VERSION is presumably set by the MKL config package found above - confirm
 message(STATUS "MKL Targets: ${MKL_IMPORTED_TARGETS}")
 
 # -----------------------------------------------------------------------------
diff --git a/doc/arkode/guide/source/Butcher.rst b/doc/arkode/guide/source/Butcher.rst
index 5bfdd6169c..6666a4f7dd 100644
--- a/doc/arkode/guide/source/Butcher.rst
+++ b/doc/arkode/guide/source/Butcher.rst
@@ -180,6 +180,41 @@ This is the default 2nd order explicit method.
    region is outlined in blue; the embedding's region is in red.
 
 
+.. _Butcher.ARK2_ERK:
+
+ARK2-ERK-3-1-2
+^^^^^^^^^^^^^^
+
+.. index:: ARK2-ERK-3-1-2
+
+Accessible via the constant ``ARKODE_ARK2_ERK_3_1_2`` to
+:c:func:`ARKStepSetTableNum()`, :c:func:`ERKStepSetTableNum()` or
+:c:func:`ARKodeButcherTable_LoadERK()`.
+Accessible via the string ``"ARKODE_ARK2_ERK_3_1_2"`` to
+:c:func:`ARKStepSetTableName()`, :c:func:`ERKStepSetTableName()` or
+:c:func:`ARKodeButcherTable_LoadERKByName()`.
+This is the explicit portion of the default 2nd order additive method (the
+explicit portion of the ARK2 method from :cite:p:`giraldo2013implicit`).
+
+.. math::
+
+   \renewcommand{\arraystretch}{1.5}
+   \begin{array}{r|ccc}
+     0            & 0                           & 0                       & 0 \\
+     2 - \sqrt{2} & 2 - \sqrt{2}                & 0                       & 0 \\
+     1            & 1 - \frac{3 + 2\sqrt{2}}{6} & \frac{3 + 2\sqrt{2}}{6} & 0 \\
+     \hline
+     2 & \frac{1}{2\sqrt{2}}    & \frac{1}{2\sqrt{2}}    & 1 - \frac{1}{\sqrt{2}} \\
+     1 & \frac{4 - \sqrt{2}}{8} & \frac{4 - \sqrt{2}}{8} & \frac{1}{2\sqrt{2}}    \\
+   \end{array}
+
+.. figure:: /figs/arkode/ark2_erk_stab_region.png
+   :scale: 65 %
+   :align: center
+
+   Linear stability region for the ARK2-ERK method. The method's
+   region is outlined in blue; the embedding's region is in red.
+
 
 .. _Butcher.Bogacki_Shampine:
 
@@ -816,6 +851,41 @@ are A- and B-stable.
    region is outlined in blue; the embedding's region is in red.
 
 
+.. _Butcher.ARK2_DIRK:
+
+ARK2-DIRK-3-1-2
+^^^^^^^^^^^^^^^
+
+.. index:: ARK2-DIRK-3-1-2
+
+Accessible via the constant ``ARKODE_ARK2_DIRK_3_1_2`` to
+:c:func:`ARKStepSetTableNum()` or
+:c:func:`ARKodeButcherTable_LoadDIRK()`.
+Accessible via the string ``"ARKODE_ARK2_DIRK_3_1_2"`` to
+:c:func:`ARKStepSetTableName()` or
+:c:func:`ARKodeButcherTable_LoadDIRKByName()`.
+This is the implicit portion of the default 2nd order additive method (the
+implicit portion of the ARK2 method from :cite:p:`giraldo2013implicit`).
+
+.. math::
+
+   \renewcommand{\arraystretch}{1.5}
+   \begin{array}{r|ccc}
+     0            & 0                      & 0                      & 0 \\
+     2 - \sqrt{2} & 1 - \frac{1}{\sqrt{2}} & 1 - \frac{1}{\sqrt{2}} & 0 \\
+     1            & \frac{1}{2\sqrt{2}}    & \frac{1}{2\sqrt{2}}    & 1 - \frac{1}{\sqrt{2}} \\
+     \hline
+     2 & \frac{1}{2\sqrt{2}}    & \frac{1}{2\sqrt{2}}    & 1 - \frac{1}{\sqrt{2}} \\
+     1 & \frac{4 - \sqrt{2}}{8} & \frac{4 - \sqrt{2}}{8} & \frac{1}{2\sqrt{2}}    \\
+   \end{array}
+
+.. figure:: /figs/arkode/ark2_dirk_stab_region.png
+   :scale: 65 %
+   :align: center
+
+   Linear stability region for the ARK2-DIRK method. The method's
+   region is outlined in blue; the embedding's region is in red.
+
 
 .. _Butcher.Billington:
 
@@ -1590,10 +1660,16 @@ Additive Butcher tables
 ---------------------------
 
 In the category of additive Runge--Kutta methods for split implicit and
-explicit calculations, ARKODE includes methods that have orders 3
-through 5, with embeddings that are of orders 2 through 4.  These
+explicit calculations, ARKODE includes methods that have orders 2
+through 5, with embeddings that are of orders 1 through 4.  These
 Butcher table pairs are as follows:
 
+* :index:`2nd-order pair <ARK-3-1-2 ARK method>`:
+  :numref:`Butcher.ARK2_ERK` with :numref:`Butcher.ARK2_DIRK`,
+  corresponding to Butcher tables ``ARKODE_ARK2_ERK_3_1_2`` and
+  ``ARKODE_ARK2_DIRK_3_1_2`` for :c:func:`ARKStepSetTableNum()`
+  or :c:func:`ARKStepSetTableName()`.
+
 * :index:`3rd-order pair <ARK-4-2-3 ARK method>`:
   :numref:`Butcher.ARK_4_2_3_E` with :numref:`Butcher.ARK_4_2_3_I`,
   corresponding to Butcher tables ``ARKODE_ARK324L2SA_ERK_4_2_3`` and
diff --git a/doc/arkode/guide/source/Introduction.rst b/doc/arkode/guide/source/Introduction.rst
index 10174ccd8c..4906c5814f 100644
--- a/doc/arkode/guide/source/Introduction.rst
+++ b/doc/arkode/guide/source/Introduction.rst
@@ -118,6 +118,25 @@ provided with SUNDIALS, or again may utilize a user-supplied module.
 Changes from previous versions
 ==============================
 
+Changes in v5.6.0
+-----------------
+
+Added the second order IMEX method from :cite:p:`giraldo2013implicit` as the
+default second order IMEX method in ARKStep. The explicit table is given by
+``ARKODE_ARK2_ERK_3_1_2`` (see :numref:`Butcher.ARK2_ERK`) and the implicit
+table by ``ARKODE_ARK2_DIRK_3_1_2`` (see :numref:`Butcher.ARK2_DIRK`).
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
+Updated the default ARKODE behavior for returning the solution once
+the internal time has reached a user-specified stop time.  Previously, the output
+solution was interpolated to the value of ``tstop``; the default is now to copy the
+internal solution vector.  Users who wish to revert to interpolation may call a new
+routine :c:func:`ARKStepSetInterpolateStopTime`,
+:c:func:`ERKStepSetInterpolateStopTime`, or :c:func:`MRIStepSetInterpolateStopTime`.
+
 Changes in v5.5.1
 -----------------
 
diff --git a/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst b/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst
index cd4acec555..4713160b05 100644
--- a/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst
+++ b/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst
@@ -876,6 +876,7 @@ Maximum no. of internal steps before *tout*       :c:func:`ARKStepSetMaxNumSteps
 Maximum absolute step size                        :c:func:`ARKStepSetMaxStep`              :math:`\infty`
 Minimum absolute step size                        :c:func:`ARKStepSetMinStep`              0.0
 Set a value for :math:`t_{stop}`                  :c:func:`ARKStepSetStopTime`             undefined
+Interpolate at :math:`t_{stop}`                   :c:func:`ARKStepSetInterpolateStopTime`  ``SUNFALSE``
 Disable the stop time                             :c:func:`ARKStepClearStopTime`           N/A
 Supply a pointer for user data                    :c:func:`ARKStepSetUserData`             ``NULL``
 Maximum no. of ARKStep error test failures        :c:func:`ARKStepSetMaxErrTestFails`      7
@@ -1267,6 +1268,23 @@ Set max number of constraint failures             :c:func:`ARKStepSetMaxNumConst
       :c:func:`ARKStepClearStopTime`.
 
 
+.. c:function:: int ARKStepSetInterpolateStopTime(void* arkode_mem, booleantype interp)
+
+   Specifies whether the output solution should be interpolated when the current
+   :math:`t` equals the specified ``tstop`` (instead of merely copying the
+   internal solution :math:`y_n`).
+
+   **Arguments:**
+      * *arkode_mem* -- pointer to the ARKStep memory block.
+      * *interp* -- flag indicating to use interpolation (1) or copy (0).
+
+   **Return value:**
+      * *ARK_SUCCESS* if successful
+      * *ARK_MEM_NULL* if the ARKStep memory is ``NULL``
+
+   .. versionadded:: 5.6.0
+
+
 .. c:function:: int ARKStepClearStopTime(void* arkode_mem)
 
    Disables the stop time set with :c:func:`ARKStepSetStopTime`.
@@ -1454,7 +1472,7 @@ Set additive RK tables via their names    :c:func:`ARKStepSetTableName()`    int
       For explicit methods, the allowed values are :math:`2 \le`
       *ord* :math:`\le 8`.  For implicit methods, the allowed values are
       :math:`2\le` *ord* :math:`\le 5`, and for ImEx methods the allowed
-      values are :math:`3 \le` *ord* :math:`\le 5`.  Any illegal input
+      values are :math:`2 \le` *ord* :math:`\le 5`.  Any illegal input
       will result in the default value of 4.
 
       Since *ord* affects the memory requirements for the internal
diff --git a/doc/arkode/guide/source/Usage/ERKStep_c_interface/User_callable.rst b/doc/arkode/guide/source/Usage/ERKStep_c_interface/User_callable.rst
index 2a3702da40..b40ddbf160 100644
--- a/doc/arkode/guide/source/Usage/ERKStep_c_interface/User_callable.rst
+++ b/doc/arkode/guide/source/Usage/ERKStep_c_interface/User_callable.rst
@@ -467,45 +467,47 @@ Optional inputs for ERKStep
 .. _ARKODE.Usage.ERKStep.ERKStepInputTable:
 .. table:: Optional inputs for ERKStep
 
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Optional input                                     | Function name                           |  Default               |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Return ERKStep solver parameters to their defaults | :c:func:`ERKStepSetDefaults()`          |  internal              |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Set dense output interpolation type                | :c:func:`ERKStepSetInterpolantType()`   | ``ARK_INTERP_HERMITE`` |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Set dense output polynomial degree                 | :c:func:`ERKStepSetInterpolantDegree()` |  5                     |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a pointer to a diagnostics output file      | :c:func:`ERKStepSetDiagnostics()`       | ``NULL``               |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a pointer to an error output file           | :c:func:`ERKStepSetErrFile()`           | ``stderr``             |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a custom error handler function             | :c:func:`ERKStepSetErrHandlerFn()`      |  internal fn           |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Disable time step adaptivity (fixed-step mode)     | :c:func:`ERKStepSetFixedStep()`         |  disabled              |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply an initial step size to attempt             | :c:func:`ERKStepSetInitStep()`          |  estimated             |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Maximum no. of warnings for :math:`t_n+h = t_n`    | :c:func:`ERKStepSetMaxHnilWarns()`      |  10                    |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Maximum no. of internal steps before *tout*        | :c:func:`ERKStepSetMaxNumSteps()`       |  500                   |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Maximum absolute step size                         | :c:func:`ERKStepSetMaxStep()`           | :math:`\infty`         |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Minimum absolute step size                         | :c:func:`ERKStepSetMinStep()`           |  0.0                   |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Set a value for :math:`t_{stop}`                   | :c:func:`ERKStepSetStopTime()`          | undefined              |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Disable the stop time                              | :c:func:`ERKStepClearStopTime`          | N/A                    |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a pointer for user data                     | :c:func:`ERKStepSetUserData()`          | ``NULL``               |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Maximum no. of ERKStep error test failures         | :c:func:`ERKStepSetMaxErrTestFails()`   |  7                     |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Set inequality constraints on solution             | :c:func:`ERKStepSetConstraints()`       | ``NULL``               |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
-   | Set max number of constraint failures              | :c:func:`ERKStepSetMaxNumConstrFails()` |  10                    |
-   +----------------------------------------------------+-----------------------------------------+------------------------+
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Optional input                                     | Function name                             |  Default               |
+   +====================================================+===========================================+========================+
+   | Return ERKStep solver parameters to their defaults | :c:func:`ERKStepSetDefaults()`            |  internal              |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Set dense output interpolation type                | :c:func:`ERKStepSetInterpolantType()`     | ``ARK_INTERP_HERMITE`` |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Set dense output polynomial degree                 | :c:func:`ERKStepSetInterpolantDegree()`   |  5                     |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a pointer to a diagnostics output file      | :c:func:`ERKStepSetDiagnostics()`         | ``NULL``               |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a pointer to an error output file           | :c:func:`ERKStepSetErrFile()`             | ``stderr``             |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a custom error handler function             | :c:func:`ERKStepSetErrHandlerFn()`        |  internal fn           |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Disable time step adaptivity (fixed-step mode)     | :c:func:`ERKStepSetFixedStep()`           |  disabled              |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply an initial step size to attempt             | :c:func:`ERKStepSetInitStep()`            |  estimated             |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Maximum no. of warnings for :math:`t_n+h = t_n`    | :c:func:`ERKStepSetMaxHnilWarns()`        |  10                    |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Maximum no. of internal steps before *tout*        | :c:func:`ERKStepSetMaxNumSteps()`         |  500                   |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Maximum absolute step size                         | :c:func:`ERKStepSetMaxStep()`             | :math:`\infty`         |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Minimum absolute step size                         | :c:func:`ERKStepSetMinStep()`             |  0.0                   |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Set a value for :math:`t_{stop}`                   | :c:func:`ERKStepSetStopTime()`            | undefined              |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Interpolate at :math:`t_{stop}`                    | :c:func:`ERKStepSetInterpolateStopTime()` | ``SUNFALSE``           |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Disable the stop time                              | :c:func:`ERKStepClearStopTime`            | N/A                    |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a pointer for user data                     | :c:func:`ERKStepSetUserData()`            | ``NULL``               |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Maximum no. of ERKStep error test failures         | :c:func:`ERKStepSetMaxErrTestFails()`     |  7                     |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Set inequality constraints on solution             | :c:func:`ERKStepSetConstraints()`         | ``NULL``               |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
+   | Set max number of constraint failures              | :c:func:`ERKStepSetMaxNumConstrFails()`   |  10                    |
+   +----------------------------------------------------+-------------------------------------------+------------------------+
 
 
 
@@ -886,6 +888,23 @@ Optional inputs for ERKStep
       :c:func:`ERKStepClearStopTime`.
 
 
+.. c:function:: int ERKStepSetInterpolateStopTime(void* arkode_mem, booleantype interp)
+
+   Specifies whether the output solution should be interpolated when the current
+   :math:`t` equals the specified ``tstop`` (instead of merely copying the
+   internal solution :math:`y_n`).
+
+   **Arguments:**
+      * *arkode_mem* -- pointer to the ERKStep memory block.
+      * *interp* -- flag indicating to use interpolation (1) or copy (0).
+
+   **Return value:**
+      * *ARK_SUCCESS* if successful
+      * *ARK_MEM_NULL* if the ERKStep memory is ``NULL``
+
+   .. versionadded:: 5.6.0
+
+
 .. c:function:: int ERKStepClearStopTime(void* arkode_mem)
 
    Disables the stop time set with :c:func:`ERKStepSetStopTime`.
diff --git a/doc/arkode/guide/source/Usage/MRIStep_c_interface/User_callable.rst b/doc/arkode/guide/source/Usage/MRIStep_c_interface/User_callable.rst
index b177f456ce..743b9c66e2 100644
--- a/doc/arkode/guide/source/Usage/MRIStep_c_interface/User_callable.rst
+++ b/doc/arkode/guide/source/Usage/MRIStep_c_interface/User_callable.rst
@@ -672,37 +672,39 @@ Optional inputs for MRIStep
 .. _ARKODE.Usage.MRIStep.MRIStepInput.Table:
 .. table:: Optional inputs for MRIStep
 
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Optional input                                                | Function name                           | Default                |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Return MRIStep solver parameters to their defaults            | :c:func:`MRIStepSetDefaults()`          | internal               |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Set dense output interpolation type                           | :c:func:`MRIStepSetInterpolantType()`   | ``ARK_INTERP_HERMITE`` |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Set dense output polynomial degree                            | :c:func:`MRIStepSetInterpolantDegree()` | 5                      |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a pointer to a diagnostics output file                 | :c:func:`MRIStepSetDiagnostics()`       | ``NULL``               |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a pointer to an error output file                      | :c:func:`MRIStepSetErrFile()`           | ``stderr``             |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a custom error handler function                        | :c:func:`MRIStepSetErrHandlerFn()`      | internal fn            |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Run with fixed-step sizes                                     | :c:func:`MRIStepSetFixedStep()`         | required               |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Maximum no. of warnings for :math:`t_n+h = t_n`               | :c:func:`MRIStepSetMaxHnilWarns()`      | 10                     |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Maximum no. of internal steps before *tout*                   | :c:func:`MRIStepSetMaxNumSteps()`       | 500                    |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Set a value for :math:`t_{stop}`                              | :c:func:`MRIStepSetStopTime()`          | undefined              |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Disable the stop time                                         | :c:func:`MRIStepClearStopTime`          | N/A                    |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a pointer for user data                                | :c:func:`MRIStepSetUserData()`          | ``NULL``               |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a function to be called prior to the inner integration | :c:func:`MRIStepSetPreInnerFn()`        | ``NULL``               |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
-   | Supply a function to be called after the inner integration    | :c:func:`MRIStepSetPostInnerFn()`       | ``NULL``               |
-   +---------------------------------------------------------------+-----------------------------------------+------------------------+
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Optional input                                                | Function name                             | Default                |
+   +===============================================================+===========================================+========================+
+   | Return MRIStep solver parameters to their defaults            | :c:func:`MRIStepSetDefaults()`            | internal               |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Set dense output interpolation type                           | :c:func:`MRIStepSetInterpolantType()`     | ``ARK_INTERP_HERMITE`` |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Set dense output polynomial degree                            | :c:func:`MRIStepSetInterpolantDegree()`   | 5                      |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a pointer to a diagnostics output file                 | :c:func:`MRIStepSetDiagnostics()`         | ``NULL``               |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a pointer to an error output file                      | :c:func:`MRIStepSetErrFile()`             | ``stderr``             |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a custom error handler function                        | :c:func:`MRIStepSetErrHandlerFn()`        | internal fn            |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Run with fixed-step sizes                                     | :c:func:`MRIStepSetFixedStep()`           | required               |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Maximum no. of warnings for :math:`t_n+h = t_n`               | :c:func:`MRIStepSetMaxHnilWarns()`        | 10                     |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Maximum no. of internal steps before *tout*                   | :c:func:`MRIStepSetMaxNumSteps()`         | 500                    |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Set a value for :math:`t_{stop}`                              | :c:func:`MRIStepSetStopTime()`            | undefined              |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Interpolate at :math:`t_{stop}`                               | :c:func:`MRIStepSetInterpolateStopTime()` | ``SUNFALSE``           |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Disable the stop time                                         | :c:func:`MRIStepClearStopTime`            | N/A                    |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a pointer for user data                                | :c:func:`MRIStepSetUserData()`            | ``NULL``               |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a function to be called prior to the inner integration | :c:func:`MRIStepSetPreInnerFn()`          | ``NULL``               |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
+   | Supply a function to be called after the inner integration    | :c:func:`MRIStepSetPostInnerFn()`         | ``NULL``               |
+   +---------------------------------------------------------------+-------------------------------------------+------------------------+
 
 
 
@@ -1105,6 +1107,23 @@ Optional inputs for MRIStep
       :c:func:`MRIStepClearStopTime`.
 
 
+.. c:function:: int MRIStepSetInterpolateStopTime(void* arkode_mem, booleantype interp)
+
+   Specifies that the output solution should be interpolated when the current
+   :math:`t` equals the specified ``tstop`` (instead of merely copying the
+   internal solution :math:`y_n`).
+
+   **Arguments:**
+      * *arkode_mem* -- pointer to the MRIStep memory block.
+      * *interp* -- flag indicating to use interpolation (1) or copy (0).
+
+   **Return value:**
+      * *ARK_SUCCESS* if successful
+      * *ARK_MEM_NULL* if the MRIStep memory is ``NULL``
+
+   .. versionadded:: 5.6.0
+
+
 .. c:function:: int MRIStepClearStopTime(void* arkode_mem)
 
    Disables the stop time set with :c:func:`MRIStepSetStopTime`.
diff --git a/doc/cvode/guide/source/Introduction.rst b/doc/cvode/guide/source/Introduction.rst
index 496dfa5be6..5ecd6bc23d 100644
--- a/doc/cvode/guide/source/Introduction.rst
+++ b/doc/cvode/guide/source/Introduction.rst
@@ -111,6 +111,19 @@ implementations.
 Changes from previous versions
 ==============================
 
+Changes in v6.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
+Updated the default CVODE behavior for returning the solution once
+the internal time has reached a user-specified stop time.  Previously, the output
+solution was interpolated to the value of ``tstop``; the default is now to copy the
+internal solution vector.  Users who wish to revert to interpolation may call the
+routine :c:func:`CVodeSetInterpolateStopTime`.
+
 Changes in v6.5.1
 -----------------
 
@@ -145,7 +158,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v6.4.1
 -----------------
diff --git a/doc/cvode/guide/source/Usage/index.rst b/doc/cvode/guide/source/Usage/index.rst
index 0f425c4b96..1bfd789b52 100644
--- a/doc/cvode/guide/source/Usage/index.rst
+++ b/doc/cvode/guide/source/Usage/index.rst
@@ -713,7 +713,7 @@ of two modes as to where CVODE is to return a solution. But these
 modes are modified if the user has set a stop time (with :c:func:`CVodeSetStopTime`) or requested
 rootfinding.
 
-.. c:function:: int CVode(void* cvode_mem, realtype tout, N_Vector yout, realtype tret, int itask)
+.. c:function:: int CVode(void* cvode_mem, realtype tout, N_Vector yout, realtype* tret, int itask)
 
    The function ``CVode`` integrates the ODE over an interval in t.
 
@@ -852,6 +852,9 @@ Main solver optional input functions
    +-------------------------------+---------------------------------------------+----------------+
    | Value of :math:`t_{stop}`     | :c:func:`CVodeSetStopTime`                  | undefined      |
    +-------------------------------+---------------------------------------------+----------------+
+   | Interpolate at                | :c:func:`CVodeSetInterpolateStopTime`       | ``SUNFALSE``   |
+   | :math:`t_{stop}`              |                                             |                |
+   +-------------------------------+---------------------------------------------+----------------+
    | Disable the stop time         | :c:func:`CVodeClearStopTime`                | N/A            |
    +-------------------------------+---------------------------------------------+----------------+
    | Maximum no. of error test     | :c:func:`CVodeSetMaxErrTestFails`           | 7              |
@@ -1091,6 +1094,22 @@ Main solver optional input functions
       A stop time not reached before a call to :c:func:`CVodeReInit` will
       remain active but can be disabled by calling :c:func:`CVodeClearStopTime`.
 
+.. c:function:: int CVodeSetInterpolateStopTime(void* cvode_mem, booleantype interp)
+
+   The function ``CVodeSetInterpolateStopTime`` specifies whether the output solution should be
+   interpolated when the current :math:`t` equals the specified ``tstop`` (instead of
+   merely copying the internal solution :math:`y_n`).
+
+   **Arguments:**
+     * ``cvode_mem`` -- pointer to the CVODE memory block.
+     * ``interp`` -- flag indicating to use interpolation (1) or copy (0).
+
+   **Return value:**
+     * ``CV_SUCCESS`` -- The optional value has been successfully set.
+     * ``CV_MEM_NULL`` -- The CVODE memory block was not initialized through a previous call to :c:func:`CVodeCreate`.
+
+   .. versionadded:: 6.6.0
+
 .. c:function:: int CVodeClearStopTime(void* cvode_mem)
 
    Disables the stop time set with :c:func:`CVodeSetStopTime`.
@@ -1630,8 +1649,8 @@ the :c:func:`CVodeSetEpsLin` function.
 
 .. _CVODE.Usage.CC.optional_input.optin_nls:
 
-Linear solver interface optional input functions
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Nonlinear solver interface optional input functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 .. _CVODE.Usage.CC.optional_input.optin_nls_table:
 
diff --git a/doc/cvodes/guide/source/Introduction.rst b/doc/cvodes/guide/source/Introduction.rst
index aec32d3649..2795919253 100644
--- a/doc/cvodes/guide/source/Introduction.rst
+++ b/doc/cvodes/guide/source/Introduction.rst
@@ -111,6 +111,19 @@ Fortran.
 Changes from previous versions
 ==============================
 
+Changes in v6.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
+Updated the default CVODES behavior for returning the solution once
+the internal time has reached a user-specified stop time.  Previously, the output
+solution was interpolated to the value of ``tstop``; the default is now to copy the
+internal solution vector.  Users who wish to revert to interpolation may call the
+routine :c:func:`CVodeSetInterpolateStopTime`.
+
 Changes in v6.5.1
 -----------------
 
@@ -146,7 +159,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v6.4.1
 -----------------
diff --git a/doc/cvodes/guide/source/Usage/ADJ.rst b/doc/cvodes/guide/source/Usage/ADJ.rst
index a6431e8882..8552d514f3 100644
--- a/doc/cvodes/guide/source/Usage/ADJ.rst
+++ b/doc/cvodes/guide/source/Usage/ADJ.rst
@@ -383,7 +383,7 @@ use in Forward Sensitivity Analysis; for that, see :numref:`CVODES.Usage.FSA`.
 The call to this function has the form
 
 
-.. c:function:: int CVodeF(void * cvode_mem, realtype tout, N_Vector yret, realtype tret, int itask, int ncheck)
+.. c:function:: int CVodeF(void * cvode_mem, realtype tout, N_Vector yret, realtype* tret, int itask, int ncheck)
 
    The function :c:func:`CVodeF` integrates the forward problem over an interval
    in :math:`t`  and saves checkpointing data.
@@ -1353,7 +1353,7 @@ To extract the values of the quadrature variables at the last return time of
 :c:func:`CVodeGetQuad`.
 
 
-.. c:function:: int CVodeGetQuadB(void * cvode_mem, whichrealtype tret, N_Vector yQB)
+.. c:function:: int CVodeGetQuadB(void * cvode_mem, int which, realtype* tret, N_Vector yQB)
 
    The function :c:func:`CVodeGetQuadB` returns the quadrature solution vector
    after  a successful return from :c:func:`CVodeB`.
diff --git a/doc/cvodes/guide/source/Usage/SIM.rst b/doc/cvodes/guide/source/Usage/SIM.rst
index 826ee20e29..dbc6506dd1 100644
--- a/doc/cvodes/guide/source/Usage/SIM.rst
+++ b/doc/cvodes/guide/source/Usage/SIM.rst
@@ -721,7 +721,7 @@ of two modes as to where CVODES is to return a solution. But these
 modes are modified if the user has set a stop time (with :c:func:`CVodeSetStopTime`) or requested
 rootfinding.
 
-.. c:function:: int CVode(void* cvode_mem, realtype tout, N_Vector yout, realtype tret, int itask)
+.. c:function:: int CVode(void* cvode_mem, realtype tout, N_Vector yout, realtype* tret, int itask)
 
    The function ``CVode`` integrates the ODE over an interval in t.
 
@@ -829,45 +829,47 @@ Main solver optional input functions
 
 .. table:: Optional inputs for CVODES
 
-   +-------------------------------+---------------------------------------------+----------------+
-   |      **Optional input**       |              **Function name**              |  **Default**   |
-   +===============================+=============================================+================+
-   | Pointer to an error file      | :c:func:`CVodeSetErrFile`                   | ``stderr``     |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Error handler function        | :c:func:`CVodeSetErrHandlerFn`              | internal fn.   |
-   +-------------------------------+---------------------------------------------+----------------+
-   | User data                     | :c:func:`CVodeSetUserData`                  | ``NULL``       |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Maximum order for BDF method  | :c:func:`CVodeSetMaxOrd`                    | 5              |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Maximum order for Adams       | :c:func:`CVodeSetMaxOrd`                    | 12             |
-   | method                        |                                             |                |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Maximum no. of internal steps | :c:func:`CVodeSetMaxNumSteps`               | 500            |
-   | before :math:`t_{out}`        |                                             |                |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Maximum no. of warnings for   | :c:func:`CVodeSetMaxHnilWarns`              | 10             |
-   | :math:`t_n+h=t_n`             |                                             |                |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Flag to activate stability    | :c:func:`CVodeSetStabLimDet`                | ``SUNFALSE``   |
-   | limit detection               |                                             |                |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Initial step size             | :c:func:`CVodeSetInitStep`                  | estimated      |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Minimum absolute step size    | :c:func:`CVodeSetMinStep`                   | 0.0            |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Maximum absolute step size    | :c:func:`CVodeSetMaxStep`                   | :math:`\infty` |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Value of :math:`t_{stop}`     | :c:func:`CVodeSetStopTime`                  | undefined      |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Disable the stop time         | :c:func:`CVodeClearStopTime`                | N/A            |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Maximum no. of error test     | :c:func:`CVodeSetMaxErrTestFails`           | 7              |
-   | failures                      |                                             |                |
-   +-------------------------------+---------------------------------------------+----------------+
-   | Inequality constraints on     | :c:func:`CVodeSetConstraints`               |                |
-   | solution                      |                                             |                |
-   +-------------------------------+---------------------------------------------+----------------+
+   +---------------------------------+---------------------------------------------+----------------+
+   |        **Optional input**       |              **Function name**              |  **Default**   |
+   +=================================+=============================================+================+
+   | Pointer to an error file        | :c:func:`CVodeSetErrFile`                   | ``stderr``     |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Error handler function          | :c:func:`CVodeSetErrHandlerFn`              | internal fn.   |
+   +---------------------------------+---------------------------------------------+----------------+
+   | User data                       | :c:func:`CVodeSetUserData`                  | ``NULL``       |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Maximum order for BDF method    | :c:func:`CVodeSetMaxOrd`                    | 5              |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Maximum order for Adams         | :c:func:`CVodeSetMaxOrd`                    | 12             |
+   | method                          |                                             |                |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Maximum no. of internal steps   | :c:func:`CVodeSetMaxNumSteps`               | 500            |
+   | before :math:`t_{out}`          |                                             |                |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Maximum no. of warnings for     | :c:func:`CVodeSetMaxHnilWarns`              | 10             |
+   | :math:`t_n+h=t_n`               |                                             |                |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Flag to activate stability      | :c:func:`CVodeSetStabLimDet`                | ``SUNFALSE``   |
+   | limit detection                 |                                             |                |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Initial step size               | :c:func:`CVodeSetInitStep`                  | estimated      |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Minimum absolute step size      | :c:func:`CVodeSetMinStep`                   | 0.0            |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Maximum absolute step size      | :c:func:`CVodeSetMaxStep`                   | :math:`\infty` |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Value of :math:`t_{stop}`       | :c:func:`CVodeSetStopTime`                  | undefined      |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Interpolate at :math:`t_{stop}` | :c:func:`CVodeSetInterpolateStopTime`       | ``SUNFALSE``   |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Disable the stop time           | :c:func:`CVodeClearStopTime`                | N/A            |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Maximum no. of error test       | :c:func:`CVodeSetMaxErrTestFails`           | 7              |
+   | failures                        |                                             |                |
+   +---------------------------------+---------------------------------------------+----------------+
+   | Inequality constraints on       | :c:func:`CVodeSetConstraints`               |                |
+   | solution                        |                                             |                |
+   +---------------------------------+---------------------------------------------+----------------+
 
 
 .. c:function:: int CVodeSetErrFile(void* cvode_mem, FILE * errfp)
@@ -1096,6 +1098,22 @@ Main solver optional input functions
       A stop time not reached before a call to :c:func:`CVodeReInit` will
       remain active but can be disabled by calling :c:func:`CVodeClearStopTime`.
 
+.. c:function:: int CVodeSetInterpolateStopTime(void* cvode_mem, booleantype interp)
+
+   The function ``CVodeSetInterpolateStopTime`` specifies whether the output solution should be
+   interpolated when the current :math:`t` equals the specified ``tstop`` (instead of
+   merely copying the internal solution :math:`y_n`).
+
+   **Arguments:**
+     * ``cvode_mem`` -- pointer to the CVODES memory block.
+     * ``interp`` -- flag indicating whether to interpolate (1) or to copy the internal solution (0).
+
+   **Return value:**
+     * ``CV_SUCCESS`` -- The optional value has been successfully set.
+     * ``CV_MEM_NULL`` -- The CVODES memory block was not initialized through a previous call to :c:func:`CVodeCreate`.
+
+   .. versionadded:: 6.6.0
+
 .. c:function:: int CVodeClearStopTime(void* cvode_mem)
 
    Disables the stop time set with :c:func:`CVodeSetStopTime`.
@@ -1627,8 +1645,8 @@ the :c:func:`CVodeSetEpsLin` function.
 
 .. _CVODES.Usage.SIM.optional_input.optin_nls:
 
-Linear solver interface optional input functions
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Nonlinear solver interface optional input functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 .. _CVODES.Usage.SIM.optional_input.optin_nls_table:
 
@@ -4024,7 +4042,7 @@ solution and quadratures at time ``t``. However, :c:func:`CVode` will still retu
 only the solution :math:`y` in ``yout``. Solution quadratures can be obtained
 using the following function:
 
-.. c:function:: int CVodeGetQuad(void * cvode_mem, realtype tret, N_Vector yQ)
+.. c:function:: int CVodeGetQuad(void * cvode_mem, realtype* tret, N_Vector yQ)
 
    The function ``CVodeGetQuad`` returns the quadrature solution vector after a  successful return from ``CVode``.
 
@@ -4104,7 +4122,7 @@ If the quadrature variables are part of the step size control mechanism,
 one of the following functions must be called to specify the
 integration tolerances for quadrature variables.
 
-.. c:function:: int CVodeQuadSVtolerances(void * cvode_mem, realtype reltolQ, realtype abstolQ)
+.. c:function:: int CVodeQuadSVtolerances(void * cvode_mem, realtype reltolQ, N_Vector abstolQ)
 
    The function ``CVodeQuadSStolerances`` specifies scalar relative and absolute  tolerances.
 
diff --git a/doc/ida/guide/source/Introduction.rst b/doc/ida/guide/source/Introduction.rst
index 76458544b9..b2a5a15671 100644
--- a/doc/ida/guide/source/Introduction.rst
+++ b/doc/ida/guide/source/Introduction.rst
@@ -72,6 +72,13 @@ systems.
 Changes from previous versions
 ==============================
 
+Changes in v6.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
 Changes in v6.5.1
 -----------------
 
@@ -107,7 +114,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v6.4.1
 -----------------
diff --git a/doc/ida/guide/source/Usage/index.rst b/doc/ida/guide/source/Usage/index.rst
index 25bae3ecae..c6fca6bcee 100644
--- a/doc/ida/guide/source/Usage/index.rst
+++ b/doc/ida/guide/source/Usage/index.rst
@@ -792,7 +792,7 @@ the user has set a stop time (with :c:func:`IDASetStopTime`) or requested
 rootfinding (with :c:func:`IDARootInit`).
 
 
-.. c:function:: int IDASolve(void * ida_mem, realtype tout, realtype tret, N_Vector yret, N_Vector ypret, int itask)
+.. c:function:: int IDASolve(void * ida_mem, realtype tout, realtype* tret, N_Vector yret, N_Vector ypret, int itask)
 
    The function ``IDASolve`` integrates the DAE over an interval in t.
 
diff --git a/doc/idas/guide/source/Introduction.rst b/doc/idas/guide/source/Introduction.rst
index f5b828966d..a2fd55c464 100644
--- a/doc/idas/guide/source/Introduction.rst
+++ b/doc/idas/guide/source/Introduction.rst
@@ -86,6 +86,14 @@ integrate any final-condition ODE dependent on the solution of the original IVP
 Changes from previous versions
 ==============================
 
+Changes in v5.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
+
 Changes in v5.5.1
 -----------------
 
@@ -121,7 +129,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v5.4.1
 -----------------
diff --git a/doc/idas/guide/source/Usage/SIM.rst b/doc/idas/guide/source/Usage/SIM.rst
index 2fc1896241..4bd1ac2738 100644
--- a/doc/idas/guide/source/Usage/SIM.rst
+++ b/doc/idas/guide/source/Usage/SIM.rst
@@ -3970,7 +3970,7 @@ then IDAS computes both a solution and quadratures at time ``t``. However,
 :c:func:`IDASolve` will still return only the solution :math:`y` in ``y``.
 Solution quadratures can be obtained using the following function:
 
-.. c:function:: int IDAGetQuad(void * ida_mem, realtype tret, N_Vector yQ)
+.. c:function:: int IDAGetQuad(void * ida_mem, realtype* tret, N_Vector yQ)
 
    The function :c:func:`IDAGetQuad` returns the quadrature solution vector after a  successful return from :c:func:`IDASolve`.
 
diff --git a/doc/kinsol/guide/source/Introduction.rst b/doc/kinsol/guide/source/Introduction.rst
index 6e6edcffcf..3b8f5a1f21 100644
--- a/doc/kinsol/guide/source/Introduction.rst
+++ b/doc/kinsol/guide/source/Introduction.rst
@@ -88,6 +88,14 @@ applications written in Fortran.
 Changes from previous versions
 ==============================
 
+Changes in v6.6.0
+-----------------
+
+Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose`
+to support user specification of ``stdout`` and ``stderr`` strings for the output
+file names.
+
+
 Changes in v6.5.1
 -----------------
 
@@ -115,7 +123,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w
 as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``,
 ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions.
 Compiling and running code that uses the SUNDIALS Fortran interfaces with
-bounds checking will now work. 
+bounds checking will now work.
 
 Changes in v6.4.1
 -----------------
diff --git a/doc/shared/Install.rst b/doc/shared/Install.rst
index 7ad8c3bcd1..4682ba1906 100644
--- a/doc/shared/Install.rst
+++ b/doc/shared/Install.rst
@@ -776,6 +776,20 @@ illustration only.
 
    Default: none
 
+.. cmakeoption:: SUNDIALS_ONEMKL_USE_GETRF_LOOP
+
+   This advanced debugging option replaces the batched LU factorization with a
+   loop over each system in the batch and a non-batched LU factorization.
+
+   Default: OFF
+
+.. cmakeoption:: SUNDIALS_ONEMKL_USE_GETRS_LOOP
+
+   This advanced debugging option replaces the batched LU solve with a loop over
+   each system in the batch and a non-batched solve.
+
+   Default: OFF
+
 .. cmakeoption:: ENABLE_OPENMP
 
    Enable OpenMP support (build the OpenMP NVector)
@@ -944,6 +958,14 @@ illustration only.
       ``dpcpp`` and ``icpx``. When using ``icpx`` the ``-fsycl`` flag and any
       ahead of time compilation flags must be added to ``CMAKE_CXX_FLAGS``.
 
+.. cmakeoption:: SUNDIALS_SYCL_2020_UNSUPPORTED
+
+   This advanced option disables the use of *some* features from the SYCL 2020
+   standard in SUNDIALS libraries and examples. This can be used to work around
+   some cases of incomplete compiler support for SYCL 2020.
+
+   Default: OFF
+
 
 .. cmakeoption:: SUNDIALS_LOGGING_LEVEL
 
diff --git a/doc/shared/figs/arkode/ark2_dirk_stab_region.png b/doc/shared/figs/arkode/ark2_dirk_stab_region.png
new file mode 100644
index 0000000000..83929af11e
Binary files /dev/null and b/doc/shared/figs/arkode/ark2_dirk_stab_region.png differ
diff --git a/doc/shared/figs/arkode/ark2_erk_stab_region.png b/doc/shared/figs/arkode/ark2_erk_stab_region.png
new file mode 100644
index 0000000000..45b125d708
Binary files /dev/null and b/doc/shared/figs/arkode/ark2_erk_stab_region.png differ
diff --git a/doc/shared/sundials.bib b/doc/shared/sundials.bib
index 2851d968bd..d62f0d9c22 100644
--- a/doc/shared/sundials.bib
+++ b/doc/shared/sundials.bib
@@ -1784,6 +1784,19 @@ @techreport{Fehlberg:69
   year        = {1969}
 }
 
+
+@article{giraldo2013implicit,
+  title     = {Implicit-explicit formulations of a three-dimensional nonhydrostatic unified model of the atmosphere ({NUMA})},
+  author    = {Giraldo, F. X. and Kelly, J. F. and Constantinescu, E. M.},
+  journal   = {SIAM Journal on Scientific Computing},
+  volume    = {35},
+  number    = {5},
+  pages     = {B1162--B1194},
+  year      = {2013},
+  publisher = {SIAM},
+  doi       = {10.1137/120876034}
+}
+
 @article{Gust:91,
   author  = {Gustafsson, K.},
   title   = {Control theoretic techniques for stepsize selection in explicit {Runge-Kutta} methods},
diff --git a/doc/shared/sundials/Fortran.rst b/doc/shared/sundials/Fortran.rst
index 20246ce8f8..bef4eb5aca 100644
--- a/doc/shared/sundials/Fortran.rst
+++ b/doc/shared/sundials/Fortran.rst
@@ -490,8 +490,10 @@ a C file pointer, SUNDIALS provides two utility functions for creating a
    the provided filename and I/O mode.
 
    **Arguments:**
-      * ``filename`` -- the full path to the file, that should have Fortran
-        type ``character(kind=C_CHAR, len=*)``.
+      * ``filename`` -- the path to the file, which should have Fortran
+        type ``character(kind=C_CHAR, len=*)``.  There are two special filenames:
+        ``stdout`` and ``stderr`` -- these two filenames will result in output
+        going to the standard output file and standard error file, respectively.
       * ``mode`` -- the I/O mode to use for the file.  This should have the
         Fortran type ``character(kind=C_CHAR, len=*)``.  The string begins
         with one of the following characters:
@@ -517,7 +519,9 @@ a C file pointer, SUNDIALS provides two utility functions for creating a
 
    **Arguments:**
       * ``fp`` -- the C ``FILE*`` that was previously obtained from ``fopen``.
-        This should have the Fortran type ``type(c_ptr)``.
+        This should have the Fortran type ``type(c_ptr)``.  Note that if either
+        ``stdout`` or ``stderr`` was opened using :c:func:`SUNDIALSFileOpen`,
+        then that stream *will not be closed* by this function.
 
 
 .. _SUNDIALS.Fortran.Portability:
diff --git a/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp b/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp
index 6c176ee53e..45eae942b2 100644
--- a/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp
+++ b/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp
@@ -129,7 +129,7 @@ int main(int argc, char *argv[])
   if (argc > 3) output = (atoi(argv[3])) ? true : false;
 
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp b/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp
index dc3286701e..44fe80d7b7 100644
--- a/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp
+++ b/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp
@@ -119,7 +119,7 @@ int main(int argc, char** argv)
   int retval;
 
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/examples/cvode/kokkos/CMakeLists.txt b/examples/cvode/kokkos/CMakeLists.txt
index 84d0e1528f..3b20ec3cd4 100644
--- a/examples/cvode/kokkos/CMakeLists.txt
+++ b/examples/cvode/kokkos/CMakeLists.txt
@@ -16,6 +16,7 @@
 # 'develop' for examples excluded from 'make test' in releases
 set(examples_list
   "cv_bruss_batched_kokkos.cpp\;\;develop"
+  "cv_bruss_batched_kokkos_2D.cpp\;\;develop"
 )
 
 # Add the build targets for each example
diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out
new file mode 100644
index 0000000000..6f2c19c3a9
--- /dev/null
+++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out
@@ -0,0 +1,137 @@
+
+Batch of independent 3-species kinetics problems
+  number of batches = 100
+  linear solver     = KokkosKernels
+  test type         = 2
+  execution space   = Cuda
+
+At t = 0
+  batch 0: y = 1.2 3.1 3
+  batch 10: y = 1.2 3.1 3
+  batch 20: y = 1.2 3.1 3
+  batch 30: y = 1.2 3.1 3
+  batch 40: y = 1.2 3.1 3
+  batch 50: y = 1.2 3.1 3
+  batch 60: y = 1.2 3.1 3
+  batch 70: y = 1.2 3.1 3
+  batch 80: y = 1.2 3.1 3
+  batch 90: y = 1.2 3.1 3
+At t = 1
+  batch 0: y = 1.10389 3.01314 3.49998
+  batch 10: y = 1.10389 3.01314 3.49998
+  batch 20: y = 1.10389 3.01314 3.49998
+  batch 30: y = 1.10389 3.01314 3.49998
+  batch 40: y = 1.10389 3.01314 3.49998
+  batch 50: y = 1.10389 3.01314 3.49998
+  batch 60: y = 1.10389 3.01314 3.49998
+  batch 70: y = 1.10389 3.01314 3.49998
+  batch 80: y = 1.10389 3.01314 3.49998
+  batch 90: y = 1.10389 3.01314 3.49998
+At t = 2
+  batch 0: y = 0.688033 3.5213 3.49999
+  batch 10: y = 0.688033 3.5213 3.49999
+  batch 20: y = 0.688033 3.5213 3.49999
+  batch 30: y = 0.688033 3.5213 3.49999
+  batch 40: y = 0.688033 3.5213 3.49999
+  batch 50: y = 0.688033 3.5213 3.49999
+  batch 60: y = 0.688033 3.5213 3.49999
+  batch 70: y = 0.688033 3.5213 3.49999
+  batch 80: y = 0.688033 3.5213 3.49999
+  batch 90: y = 0.688033 3.5213 3.49999
+At t = 3
+  batch 0: y = 0.409472 4.27781 3.49999
+  batch 10: y = 0.409472 4.27781 3.49999
+  batch 20: y = 0.409472 4.27781 3.49999
+  batch 30: y = 0.409472 4.27781 3.49999
+  batch 40: y = 0.409472 4.27781 3.49999
+  batch 50: y = 0.409472 4.27781 3.49999
+  batch 60: y = 0.409472 4.27781 3.49999
+  batch 70: y = 0.409472 4.27781 3.49999
+  batch 80: y = 0.409472 4.27781 3.49999
+  batch 90: y = 0.409472 4.27781 3.49999
+At t = 4
+  batch 0: y = 0.36788 4.94194 3.49999
+  batch 10: y = 0.36788 4.94194 3.49999
+  batch 20: y = 0.36788 4.94194 3.49999
+  batch 30: y = 0.36788 4.94194 3.49999
+  batch 40: y = 0.36788 4.94194 3.49999
+  batch 50: y = 0.36788 4.94194 3.49999
+  batch 60: y = 0.36788 4.94194 3.49999
+  batch 70: y = 0.36788 4.94194 3.49999
+  batch 80: y = 0.36788 4.94194 3.49999
+  batch 90: y = 0.36788 4.94194 3.49999
+At t = 5
+  batch 0: y = 0.413842 5.51057 3.49999
+  batch 10: y = 0.413842 5.51057 3.49999
+  batch 20: y = 0.413842 5.51057 3.49999
+  batch 30: y = 0.413842 5.51057 3.49999
+  batch 40: y = 0.413842 5.51057 3.49999
+  batch 50: y = 0.413842 5.51057 3.49999
+  batch 60: y = 0.413842 5.51057 3.49999
+  batch 70: y = 0.413842 5.51057 3.49999
+  batch 80: y = 0.413842 5.51057 3.49999
+  batch 90: y = 0.413842 5.51057 3.49999
+At t = 6
+  batch 0: y = 0.589207 5.85566 3.49999
+  batch 10: y = 0.589207 5.85566 3.49999
+  batch 20: y = 0.589207 5.85566 3.49999
+  batch 30: y = 0.589207 5.85566 3.49999
+  batch 40: y = 0.589207 5.85566 3.49999
+  batch 50: y = 0.589207 5.85566 3.49999
+  batch 60: y = 0.589207 5.85566 3.49999
+  batch 70: y = 0.589207 5.85566 3.49999
+  batch 80: y = 0.589207 5.85566 3.49999
+  batch 90: y = 0.589207 5.85566 3.49999
+At t = 7
+  batch 0: y = 4.75675 0.735405 3.49992
+  batch 10: y = 4.75675 0.735405 3.49992
+  batch 20: y = 4.75675 0.735405 3.49992
+  batch 30: y = 4.75675 0.735405 3.49992
+  batch 40: y = 4.75675 0.735405 3.49992
+  batch 50: y = 4.75675 0.735405 3.49992
+  batch 60: y = 4.75675 0.735405 3.49992
+  batch 70: y = 4.75675 0.735405 3.49992
+  batch 80: y = 4.75675 0.735405 3.49992
+  batch 90: y = 4.75675 0.735405 3.49992
+At t = 8
+  batch 0: y = 1.81355 1.57573 3.49997
+  batch 10: y = 1.81355 1.57573 3.49997
+  batch 20: y = 1.81355 1.57573 3.49997
+  batch 30: y = 1.81355 1.57573 3.49997
+  batch 40: y = 1.81355 1.57573 3.49997
+  batch 50: y = 1.81355 1.57573 3.49997
+  batch 60: y = 1.81355 1.57573 3.49997
+  batch 70: y = 1.81355 1.57573 3.49997
+  batch 80: y = 1.81355 1.57573 3.49997
+  batch 90: y = 1.81355 1.57573 3.49997
+At t = 9
+  batch 0: y = 0.527935 2.80731 3.49999
+  batch 10: y = 0.527935 2.80731 3.49999
+  batch 20: y = 0.527935 2.80731 3.49999
+  batch 30: y = 0.527935 2.80731 3.49999
+  batch 40: y = 0.527935 2.80731 3.49999
+  batch 50: y = 0.527935 2.80731 3.49999
+  batch 60: y = 0.527935 2.80731 3.49999
+  batch 70: y = 0.527935 2.80731 3.49999
+  batch 80: y = 0.527935 2.80731 3.49999
+  batch 90: y = 0.527935 2.80731 3.49999
+At t = 10
+  batch 0: y = 0.305602 3.65734 3.49999
+  batch 10: y = 0.305602 3.65734 3.49999
+  batch 20: y = 0.305602 3.65734 3.49999
+  batch 30: y = 0.305602 3.65734 3.49999
+  batch 40: y = 0.305602 3.65734 3.49999
+  batch 50: y = 0.305602 3.65734 3.49999
+  batch 60: y = 0.305602 3.65734 3.49999
+  batch 70: y = 0.305602 3.65734 3.49999
+  batch 80: y = 0.305602 3.65734 3.49999
+  batch 90: y = 0.305602 3.65734 3.49999
+
+Final Statistics:
+  Steps            = 344
+  RHS evals        = 464
+  LS setups        = 59
+  Jac evals        = 7
+  NLS iters        = 461
+  NLS fails        = 1
+  Error test fails = 20
diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out
new file mode 100644
index 0000000000..69f0b74a18
--- /dev/null
+++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out
@@ -0,0 +1,137 @@
+
+Batch of independent 3-species kinetics problems
+  number of batches = 100
+  linear solver     = KokkosKernels
+  test type         = 2
+  execution space   = OpenMP
+
+At t = 0
+  batch 0: y = 1.2 3.1 3
+  batch 10: y = 1.2 3.1 3
+  batch 20: y = 1.2 3.1 3
+  batch 30: y = 1.2 3.1 3
+  batch 40: y = 1.2 3.1 3
+  batch 50: y = 1.2 3.1 3
+  batch 60: y = 1.2 3.1 3
+  batch 70: y = 1.2 3.1 3
+  batch 80: y = 1.2 3.1 3
+  batch 90: y = 1.2 3.1 3
+At t = 1
+  batch 0: y = 1.10389 3.01314 3.49998
+  batch 10: y = 1.10389 3.01314 3.49998
+  batch 20: y = 1.10389 3.01314 3.49998
+  batch 30: y = 1.10389 3.01314 3.49998
+  batch 40: y = 1.10389 3.01314 3.49998
+  batch 50: y = 1.10389 3.01314 3.49998
+  batch 60: y = 1.10389 3.01314 3.49998
+  batch 70: y = 1.10389 3.01314 3.49998
+  batch 80: y = 1.10389 3.01314 3.49998
+  batch 90: y = 1.10389 3.01314 3.49998
+At t = 2
+  batch 0: y = 0.688033 3.5213 3.49999
+  batch 10: y = 0.688033 3.5213 3.49999
+  batch 20: y = 0.688033 3.5213 3.49999
+  batch 30: y = 0.688033 3.5213 3.49999
+  batch 40: y = 0.688033 3.5213 3.49999
+  batch 50: y = 0.688033 3.5213 3.49999
+  batch 60: y = 0.688033 3.5213 3.49999
+  batch 70: y = 0.688033 3.5213 3.49999
+  batch 80: y = 0.688033 3.5213 3.49999
+  batch 90: y = 0.688033 3.5213 3.49999
+At t = 3
+  batch 0: y = 0.409472 4.27781 3.49999
+  batch 10: y = 0.409472 4.27781 3.49999
+  batch 20: y = 0.409472 4.27781 3.49999
+  batch 30: y = 0.409472 4.27781 3.49999
+  batch 40: y = 0.409472 4.27781 3.49999
+  batch 50: y = 0.409472 4.27781 3.49999
+  batch 60: y = 0.409472 4.27781 3.49999
+  batch 70: y = 0.409472 4.27781 3.49999
+  batch 80: y = 0.409472 4.27781 3.49999
+  batch 90: y = 0.409472 4.27781 3.49999
+At t = 4
+  batch 0: y = 0.36788 4.94194 3.49999
+  batch 10: y = 0.36788 4.94194 3.49999
+  batch 20: y = 0.36788 4.94194 3.49999
+  batch 30: y = 0.36788 4.94194 3.49999
+  batch 40: y = 0.36788 4.94194 3.49999
+  batch 50: y = 0.36788 4.94194 3.49999
+  batch 60: y = 0.36788 4.94194 3.49999
+  batch 70: y = 0.36788 4.94194 3.49999
+  batch 80: y = 0.36788 4.94194 3.49999
+  batch 90: y = 0.36788 4.94194 3.49999
+At t = 5
+  batch 0: y = 0.413842 5.51057 3.49999
+  batch 10: y = 0.413842 5.51057 3.49999
+  batch 20: y = 0.413842 5.51057 3.49999
+  batch 30: y = 0.413842 5.51057 3.49999
+  batch 40: y = 0.413842 5.51057 3.49999
+  batch 50: y = 0.413842 5.51057 3.49999
+  batch 60: y = 0.413842 5.51057 3.49999
+  batch 70: y = 0.413842 5.51057 3.49999
+  batch 80: y = 0.413842 5.51057 3.49999
+  batch 90: y = 0.413842 5.51057 3.49999
+At t = 6
+  batch 0: y = 0.589207 5.85566 3.49999
+  batch 10: y = 0.589207 5.85566 3.49999
+  batch 20: y = 0.589207 5.85566 3.49999
+  batch 30: y = 0.589207 5.85566 3.49999
+  batch 40: y = 0.589207 5.85566 3.49999
+  batch 50: y = 0.589207 5.85566 3.49999
+  batch 60: y = 0.589207 5.85566 3.49999
+  batch 70: y = 0.589207 5.85566 3.49999
+  batch 80: y = 0.589207 5.85566 3.49999
+  batch 90: y = 0.589207 5.85566 3.49999
+At t = 7
+  batch 0: y = 4.75675 0.735405 3.49992
+  batch 10: y = 4.75675 0.735405 3.49992
+  batch 20: y = 4.75675 0.735405 3.49992
+  batch 30: y = 4.75675 0.735405 3.49992
+  batch 40: y = 4.75675 0.735405 3.49992
+  batch 50: y = 4.75675 0.735405 3.49992
+  batch 60: y = 4.75675 0.735405 3.49992
+  batch 70: y = 4.75675 0.735405 3.49992
+  batch 80: y = 4.75675 0.735405 3.49992
+  batch 90: y = 4.75675 0.735405 3.49992
+At t = 8
+  batch 0: y = 1.81355 1.57573 3.49997
+  batch 10: y = 1.81355 1.57573 3.49997
+  batch 20: y = 1.81355 1.57573 3.49997
+  batch 30: y = 1.81355 1.57573 3.49997
+  batch 40: y = 1.81355 1.57573 3.49997
+  batch 50: y = 1.81355 1.57573 3.49997
+  batch 60: y = 1.81355 1.57573 3.49997
+  batch 70: y = 1.81355 1.57573 3.49997
+  batch 80: y = 1.81355 1.57573 3.49997
+  batch 90: y = 1.81355 1.57573 3.49997
+At t = 9
+  batch 0: y = 0.527935 2.80731 3.49999
+  batch 10: y = 0.527935 2.80731 3.49999
+  batch 20: y = 0.527935 2.80731 3.49999
+  batch 30: y = 0.527935 2.80731 3.49999
+  batch 40: y = 0.527935 2.80731 3.49999
+  batch 50: y = 0.527935 2.80731 3.49999
+  batch 60: y = 0.527935 2.80731 3.49999
+  batch 70: y = 0.527935 2.80731 3.49999
+  batch 80: y = 0.527935 2.80731 3.49999
+  batch 90: y = 0.527935 2.80731 3.49999
+At t = 10
+  batch 0: y = 0.305602 3.65734 3.49999
+  batch 10: y = 0.305602 3.65734 3.49999
+  batch 20: y = 0.305602 3.65734 3.49999
+  batch 30: y = 0.305602 3.65734 3.49999
+  batch 40: y = 0.305602 3.65734 3.49999
+  batch 50: y = 0.305602 3.65734 3.49999
+  batch 60: y = 0.305602 3.65734 3.49999
+  batch 70: y = 0.305602 3.65734 3.49999
+  batch 80: y = 0.305602 3.65734 3.49999
+  batch 90: y = 0.305602 3.65734 3.49999
+
+Final Statistics:
+  Steps            = 344
+  RHS evals        = 464
+  LS setups        = 59
+  Jac evals        = 7
+  NLS iters        = 461
+  NLS fails        = 1
+  Error test fails = 20
diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out
new file mode 100644
index 0000000000..6cabd0d57d
--- /dev/null
+++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out
@@ -0,0 +1,137 @@
+
+Batch of independent 3-species kinetics problems
+  number of batches = 100
+  linear solver     = KokkosKernels
+  test type         = 2
+  execution space   = Serial
+
+At t = 0
+  batch 0: y = 1.2 3.1 3
+  batch 10: y = 1.2 3.1 3
+  batch 20: y = 1.2 3.1 3
+  batch 30: y = 1.2 3.1 3
+  batch 40: y = 1.2 3.1 3
+  batch 50: y = 1.2 3.1 3
+  batch 60: y = 1.2 3.1 3
+  batch 70: y = 1.2 3.1 3
+  batch 80: y = 1.2 3.1 3
+  batch 90: y = 1.2 3.1 3
+At t = 1
+  batch 0: y = 1.10389 3.01314 3.49998
+  batch 10: y = 1.10389 3.01314 3.49998
+  batch 20: y = 1.10389 3.01314 3.49998
+  batch 30: y = 1.10389 3.01314 3.49998
+  batch 40: y = 1.10389 3.01314 3.49998
+  batch 50: y = 1.10389 3.01314 3.49998
+  batch 60: y = 1.10389 3.01314 3.49998
+  batch 70: y = 1.10389 3.01314 3.49998
+  batch 80: y = 1.10389 3.01314 3.49998
+  batch 90: y = 1.10389 3.01314 3.49998
+At t = 2
+  batch 0: y = 0.688033 3.5213 3.49999
+  batch 10: y = 0.688033 3.5213 3.49999
+  batch 20: y = 0.688033 3.5213 3.49999
+  batch 30: y = 0.688033 3.5213 3.49999
+  batch 40: y = 0.688033 3.5213 3.49999
+  batch 50: y = 0.688033 3.5213 3.49999
+  batch 60: y = 0.688033 3.5213 3.49999
+  batch 70: y = 0.688033 3.5213 3.49999
+  batch 80: y = 0.688033 3.5213 3.49999
+  batch 90: y = 0.688033 3.5213 3.49999
+At t = 3
+  batch 0: y = 0.409472 4.27781 3.49999
+  batch 10: y = 0.409472 4.27781 3.49999
+  batch 20: y = 0.409472 4.27781 3.49999
+  batch 30: y = 0.409472 4.27781 3.49999
+  batch 40: y = 0.409472 4.27781 3.49999
+  batch 50: y = 0.409472 4.27781 3.49999
+  batch 60: y = 0.409472 4.27781 3.49999
+  batch 70: y = 0.409472 4.27781 3.49999
+  batch 80: y = 0.409472 4.27781 3.49999
+  batch 90: y = 0.409472 4.27781 3.49999
+At t = 4
+  batch 0: y = 0.36788 4.94194 3.49999
+  batch 10: y = 0.36788 4.94194 3.49999
+  batch 20: y = 0.36788 4.94194 3.49999
+  batch 30: y = 0.36788 4.94194 3.49999
+  batch 40: y = 0.36788 4.94194 3.49999
+  batch 50: y = 0.36788 4.94194 3.49999
+  batch 60: y = 0.36788 4.94194 3.49999
+  batch 70: y = 0.36788 4.94194 3.49999
+  batch 80: y = 0.36788 4.94194 3.49999
+  batch 90: y = 0.36788 4.94194 3.49999
+At t = 5
+  batch 0: y = 0.413842 5.51057 3.49999
+  batch 10: y = 0.413842 5.51057 3.49999
+  batch 20: y = 0.413842 5.51057 3.49999
+  batch 30: y = 0.413842 5.51057 3.49999
+  batch 40: y = 0.413842 5.51057 3.49999
+  batch 50: y = 0.413842 5.51057 3.49999
+  batch 60: y = 0.413842 5.51057 3.49999
+  batch 70: y = 0.413842 5.51057 3.49999
+  batch 80: y = 0.413842 5.51057 3.49999
+  batch 90: y = 0.413842 5.51057 3.49999
+At t = 6
+  batch 0: y = 0.589207 5.85566 3.49999
+  batch 10: y = 0.589207 5.85566 3.49999
+  batch 20: y = 0.589207 5.85566 3.49999
+  batch 30: y = 0.589207 5.85566 3.49999
+  batch 40: y = 0.589207 5.85566 3.49999
+  batch 50: y = 0.589207 5.85566 3.49999
+  batch 60: y = 0.589207 5.85566 3.49999
+  batch 70: y = 0.589207 5.85566 3.49999
+  batch 80: y = 0.589207 5.85566 3.49999
+  batch 90: y = 0.589207 5.85566 3.49999
+At t = 7
+  batch 0: y = 4.75675 0.735405 3.49992
+  batch 10: y = 4.75675 0.735405 3.49992
+  batch 20: y = 4.75675 0.735405 3.49992
+  batch 30: y = 4.75675 0.735405 3.49992
+  batch 40: y = 4.75675 0.735405 3.49992
+  batch 50: y = 4.75675 0.735405 3.49992
+  batch 60: y = 4.75675 0.735405 3.49992
+  batch 70: y = 4.75675 0.735405 3.49992
+  batch 80: y = 4.75675 0.735405 3.49992
+  batch 90: y = 4.75675 0.735405 3.49992
+At t = 8
+  batch 0: y = 1.81355 1.57573 3.49997
+  batch 10: y = 1.81355 1.57573 3.49997
+  batch 20: y = 1.81355 1.57573 3.49997
+  batch 30: y = 1.81355 1.57573 3.49997
+  batch 40: y = 1.81355 1.57573 3.49997
+  batch 50: y = 1.81355 1.57573 3.49997
+  batch 60: y = 1.81355 1.57573 3.49997
+  batch 70: y = 1.81355 1.57573 3.49997
+  batch 80: y = 1.81355 1.57573 3.49997
+  batch 90: y = 1.81355 1.57573 3.49997
+At t = 9
+  batch 0: y = 0.527935 2.80731 3.49999
+  batch 10: y = 0.527935 2.80731 3.49999
+  batch 20: y = 0.527935 2.80731 3.49999
+  batch 30: y = 0.527935 2.80731 3.49999
+  batch 40: y = 0.527935 2.80731 3.49999
+  batch 50: y = 0.527935 2.80731 3.49999
+  batch 60: y = 0.527935 2.80731 3.49999
+  batch 70: y = 0.527935 2.80731 3.49999
+  batch 80: y = 0.527935 2.80731 3.49999
+  batch 90: y = 0.527935 2.80731 3.49999
+At t = 10
+  batch 0: y = 0.305602 3.65734 3.49999
+  batch 10: y = 0.305602 3.65734 3.49999
+  batch 20: y = 0.305602 3.65734 3.49999
+  batch 30: y = 0.305602 3.65734 3.49999
+  batch 40: y = 0.305602 3.65734 3.49999
+  batch 50: y = 0.305602 3.65734 3.49999
+  batch 60: y = 0.305602 3.65734 3.49999
+  batch 70: y = 0.305602 3.65734 3.49999
+  batch 80: y = 0.305602 3.65734 3.49999
+  batch 90: y = 0.305602 3.65734 3.49999
+
+Final Statistics:
+  Steps            = 344
+  RHS evals        = 464
+  LS setups        = 59
+  Jac evals        = 7
+  NLS iters        = 461
+  NLS fails        = 1
+  Error test fails = 20
diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp
new file mode 100644
index 0000000000..58a136a74f
--- /dev/null
+++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp
@@ -0,0 +1,425 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Daniel R. Reynolds @ SMU
+ *                David J. Gardner and Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * The following is a simple example problem based off of ark_brusselator.c.
+ *
+ * We simulate a scenario where a set of independent ODEs are batched together
+ * to form a larger system. Each independent ODE system has 3 components,
+ * Y = [u, v, w], satisfying the equations,
+ *
+ *   du/dt = a - (w + 1) * u + v * u^2
+ *   dv/dt = w * u - v * u^2
+ *   dw/dt = (b - w) / ep - w * u
+ *
+ * for t in the interval [0, 10], with initial conditions Y0 = [u0, v0, w0].
+ * The problem is stiff and there are 3 testing scenarios:
+ *
+ * Reactor 0: u0 = 3.9, v0 = 1.1, w0 = 2.8, a = 1.2, b = 2.5, ep = 1.0e-5
+ *   Here, all three components exhibit a rapid transient change during the
+ *   first 0.2 time units, followed by a slow and smooth evolution.
+ *
+ * Reactor 1: u0 = 3, v0 = 3, w0 = 3.5, a = 0.5, b = 3, ep = 5.0e-4
+ *   Here, all components undergo very rapid initial transients during the first
+ *   0.3 time units, and all then proceed very smoothly for the remainder of the
+ *   simulation.
+ *
+ * Reactor 2: u0 = 1.2, v0 = 3.1, w0 = 3, a = 1, b = 3.5, ep = 5.0e-6
+ *   Here, w experiences a fast initial transient, jumping 0.5 within a few
+ *   steps. All values proceed smoothly until around t=6.5, when both u and v
+ *   undergo a sharp transition, with u increasing from around 0.5 to 5 and v
+ *   decreasing from around 6 to 1 in less than 0.5 time units. After this
+ *   transition, both u and v continue to evolve somewhat rapidly for another
+ *   1.4 time units, and finish off smoothly.
+ *
+ * This program solves the problem with the BDF method, Newton iteration, a
+ * user-supplied Jacobian routine, and, since the grouping of the independent
+ * systems results in a block diagonal linear system, the dense KOKKOS
+ * SUNLinearSolver which supports batched systems. The solution is printed at
+ * t = 0 and at 10 equally spaced times; run statistics are printed at the end.
+ *
+ * Unlike the example cv_bruss_batched_kokkos.cpp, this example utilizes Kokkos'
+ * multi-dimensional view functionality to consider a 2D grouping, y(i,j), where
+ * i corresponds with the batch index, and j corresponds to the component (u,v,w).
+ *
+ * The program takes three optional arguments: the number of independent ODE
+ * systems (i.e., number of batches), the linear solver type (KOKKOS batched LU
+ * or non-batched GMRES with the Jacobian computed by difference quotients),
+ * and the test type (uniform_0, uniform_1, or uniform_2).
+ *
+ *   ./cv_bruss_batched_kokkos_2D [num_batches] [solver_type] [test_type]
+ *
+ * Options:
+ *   num_batches <int>
+ *   solver_type:
+ *     0 - KOKKOS batched LU (default)
+ *     1 - SUNDIALS non-batched GMRES with difference quotients Jacobian
+ *   test_type:
+ *     0 - uniform_0, all batches are Reactor 0
+ *     1 - uniform_1, all batches are Reactor 1
+ *     2 - uniform_2, all batches are Reactor 2 (default)
+ * ---------------------------------------------------------------------------*/
+
+#include <cstdio>
+#include <cvode/cvode.h>
+#include <memory>
+#include <nvector/nvector_kokkos.hpp>
+#include <sunlinsol/sunlinsol_kokkosdense.hpp>
+#include <sunlinsol/sunlinsol_spgmr.h>
+#include <sunmatrix/sunmatrix_kokkosdense.hpp>
+#include <vector>
+
+// Common utility functions
+#include <example_utilities.hpp>
+
+// Execution and memory spaces, selected at compile time (fallback: Serial)
+#if defined(USE_CUDA)
+using ExecSpace = Kokkos::Cuda;
+using MemSpace  = Kokkos::CudaSpace;
+#elif defined(USE_HIP)
+#if KOKKOS_VERSION / 10000 > 3 // presumably Kokkos major version > 3 -- confirm
+using ExecSpace = Kokkos::HIP;
+using MemSpace  = Kokkos::HIPSpace;
+#else // older Kokkos: HIP types live in the Experimental namespace
+using ExecSpace = Kokkos::Experimental::HIP;
+using MemSpace  = Kokkos::Experimental::HIPSpace;
+#endif // KOKKOS_VERSION
+#elif defined(USE_OPENMP)
+using ExecSpace = Kokkos::OpenMP;
+using MemSpace  = Kokkos::HostSpace;
+#else // no backend macro defined
+using ExecSpace = Kokkos::Serial;
+using MemSpace  = Kokkos::HostSpace;
+#endif // USE_CUDA / USE_HIP / USE_OPENMP
+
+using Vec1D     = Kokkos::View<realtype*, MemSpace>; // NOTE(review): not referenced in this example
+using Vec2D     = Kokkos::View<realtype**, Kokkos::LayoutRight, MemSpace>; // indexed (batch, component)
+using Vec2DHost = Vec2D::HostMirror; // host-side counterpart used for printing
+using VecType   = sundials::kokkos::Vector<ExecSpace>;
+using MatType   = sundials::kokkos::DenseMatrix<ExecSpace>;
+using LSType    = sundials::kokkos::DenseLinearSolver<ExecSpace>;
+using SizeType  = VecType::size_type; // index type of the parallel loops
+
+// Constants
+#define ZERO SUN_RCONST(0.0)
+#define ONE  SUN_RCONST(1.0)
+#define TWO  SUN_RCONST(2.0)
+
+// User-supplied functions called by CVODE
+static int f(sunrealtype t, N_Vector y, N_Vector ydot, void* user_data);
+
+static int Jac(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J,
+               void* user_data, N_Vector tmp1, N_Vector tmp2, N_Vector tmp3);
+
+// Problem data attached with CVodeSetUserData and read back in f and Jac
+struct UserData
+{
+  int nbatches  = 100; // number of independent ODE systems (batches)
+  int batchSize = 3;   // unknowns per system: u, v, w
+  sunrealtype a, b;    // constant parameters a and b of the ODE (set in main)
+  sunrealtype ep;      // stiffness parameter ep of the ODE (set in main)
+};
+
+/* -----------------------------------------------------------------------------
+ * Main Program: set up the batched problem, integrate with CVODE, print stats
+ * ---------------------------------------------------------------------------*/
+
+int main(int argc, char* argv[])
+{
+  // Create the SUNDIALS context
+  sundials::Context sunctx;
+
+  Kokkos::initialize(argc, argv);
+  { // scope: objects declared below are destroyed before Kokkos::finalize()
+    // Create UserData (defaults: 100 batches of size 3)
+    UserData udata;
+
+    // Parse the optional positional arguments: num_batches solver_type test_type
+    int argi = 0;
+
+    // Total number of batch systems
+    if (argc > 1) udata.nbatches = atoi(argv[++argi]);
+
+    // Linear solver type (0 = Kokkos batched dense solver, otherwise GMRES)
+    int solver_type = 0;
+    if (argc > 2) solver_type = atoi(argv[++argi]);
+
+    // Test type selecting the reactor configuration (0, 1, or 2)
+    int test_type = 2;
+    if (argc > 3) test_type = atoi(argv[++argi]);
+
+    // Shortcuts
+    int nbatches  = udata.nbatches;
+    int batchSize = udata.batchSize;
+
+    std::cout << "\nBatch of independent 3-species kinetics problems\n"
+              << "  number of batches = " << nbatches << "\n"
+              << "  linear solver     = "
+              << (solver_type ? "GMRES" : "KokkosKernels") << "\n"
+              << "  test type         = " << test_type << "\n"
+              << "  execution space   = " << ExecSpace().name() << "\n\n";
+
+    sunrealtype u0, v0, w0; // initial conditions, set per test type below
+    if (test_type == 0)
+    {
+      u0 = SUN_RCONST(3.9);
+      v0 = SUN_RCONST(1.1);
+      w0 = SUN_RCONST(2.8);
+
+      udata.a  = SUN_RCONST(1.2);
+      udata.b  = SUN_RCONST(2.5);
+      udata.ep = SUN_RCONST(1.0e-5);
+    }
+    else if (test_type == 1)
+    {
+      u0 = SUN_RCONST(3.0);
+      v0 = SUN_RCONST(3.0);
+      w0 = SUN_RCONST(3.5);
+
+      udata.a  = SUN_RCONST(0.5);
+      udata.b  = SUN_RCONST(3.0);
+      udata.ep = SUN_RCONST(5.0e-4);
+    }
+    else if (test_type == 2)
+    {
+      u0 = SUN_RCONST(1.2);
+      v0 = SUN_RCONST(3.1);
+      w0 = SUN_RCONST(3.0);
+
+      udata.a  = SUN_RCONST(1.0);
+      udata.b  = SUN_RCONST(3.5);
+      udata.ep = SUN_RCONST(5.0e-6);
+    }
+    else
+    {
+      std::cerr << "ERROR: Invalid test type option\n";
+      return -1; // NOTE(review): returns without reaching Kokkos::finalize()
+    }
+
+    // Create the solution vector and a 2D (batch, component) view over its data
+    const sunrealtype T0 = SUN_RCONST(0.0);
+
+    SizeType length{static_cast<SizeType>(batchSize * nbatches)};
+    VecType y{length, sunctx};
+    Vec2D y2d((y.View()).data(), nbatches, batchSize);
+
+    Kokkos::parallel_for(
+      "fill_y", Kokkos::RangePolicy<ExecSpace>(0, nbatches),
+      KOKKOS_LAMBDA(const SizeType i) {
+        y2d(i,0) = u0;
+        y2d(i,1) = v0;
+        y2d(i,2) = w0;
+      });
+
+    // Create vector of absolute tolerances (the same value for every entry)
+    VecType abstol{length, sunctx};
+    N_VConst(SUN_RCONST(1.0e-10), abstol);
+
+    // Create CVODE using Backward Differentiation Formula methods
+    void* cvode_mem = CVodeCreate(CV_BDF, sunctx);
+    if (check_ptr(cvode_mem, "CVodeCreate")) { return 1; }
+
+    // Initialize the integrator and set the ODE right-hand side function
+    int retval = CVodeInit(cvode_mem, f, T0, y);
+    if (check_flag(retval, "CVodeInit")) { return 1; }
+
+    // Attach the user data structure
+    retval = CVodeSetUserData(cvode_mem, &udata);
+    if (check_flag(retval, "CVodeSetUserData")) { return 1; }
+
+    // Specify the scalar relative tolerance and vector absolute tolerances
+    retval = CVodeSVtolerances(cvode_mem, SUN_RCONST(1.0e-6), abstol);
+    if (check_flag(retval, "CVodeSVtolerances")) { return 1; }
+
+    // Create the matrix and linear solver objects (owned until end of scope)
+    std::unique_ptr<sundials::ConvertibleTo<SUNMatrix>> A;
+    std::unique_ptr<sundials::ConvertibleTo<SUNLinearSolver>> LS;
+
+    if (solver_type == 0)
+    {
+      // Create Kokkos dense block diagonal matrix
+      A = std::make_unique<MatType>(nbatches, batchSize, batchSize, sunctx);
+
+      // Create Kokkos batched dense linear solver
+      LS = std::make_unique<LSType>(sunctx);
+
+      // Attach the matrix and linear solver to CVODE
+      retval = CVodeSetLinearSolver(cvode_mem, LS->Convert(), A->Convert());
+      if (check_flag(retval, "CVodeSetLinearSolver")) return 1;
+
+      // Set the user-supplied Jacobian function
+      retval = CVodeSetJacFn(cvode_mem, Jac);
+      if (check_flag(retval, "CVodeSetJacFn")) return 1;
+    }
+    else
+    {
+      // Create matrix-free GMRES linear solver
+      LS = std::make_unique<sundials::experimental::SUNLinearSolverView>(
+        SUNLinSol_SPGMR(y, SUN_PREC_NONE, 0, sunctx));
+
+      // Attach the linear solver to CVODE (no matrix object in this case)
+      retval = CVodeSetLinearSolver(cvode_mem, LS->Convert(), nullptr);
+      if (check_flag(retval, "CVodeSetLinearSolver")) return 1;
+    }
+
+    // Final time and time between outputs
+    const sunrealtype Tf    = SUN_RCONST(10.0);
+    const sunrealtype dTout = SUN_RCONST(1.0);
+
+    // Number of output times
+    const int Nt = static_cast<int>(ceil(Tf / dTout));
+
+    // Current time and first output time
+    sunrealtype t    = T0;
+    sunrealtype tout = T0 + dTout;
+
+    // Initial output (y2d_h views the host data; CopyFromDevice refreshes it)
+    Vec2DHost y2d_h((y.HostView()).data(), nbatches, batchSize);
+    sundials::kokkos::CopyFromDevice(y);
+    Kokkos::fence();
+    std::cout << "At t = " << t << std::endl;
+    for (int j = 0; j < nbatches; j += 10) // print every 10th batch
+    {
+      std::cout << "  batch " << j << ": y = " << y2d_h(j,0) << " "
+                << y2d_h(j,1) << " " << y2d_h(j,2) << std::endl;
+    }
+
+    // Loop over output times
+    for (int iout = 0; iout < Nt; iout++)
+    {
+      // Advance in time
+      retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL);
+      if (check_flag(retval, "CVode")) break; // leave the loop if check_flag reports a problem
+
+      // Output solution from some batches (every 10th)
+      sundials::kokkos::CopyFromDevice(y);
+      Kokkos::fence();
+      std::cout << "At t = " << t << std::endl;
+      for (int j = 0; j < nbatches; j += 10)
+      {
+        std::cout << "  batch " << j << ": y = " << y2d_h(j,0) << " "
+                  << y2d_h(j,1) << " " << y2d_h(j,2) << std::endl;
+      }
+
+      tout += dTout;
+      tout = (tout > Tf) ? Tf : tout; // never request output past Tf
+    }
+
+    // Print some final statistics
+    long int nst, nfe, nsetups, nje, nni, ncfn, netf;
+
+    retval = CVodeGetNumSteps(cvode_mem, &nst);
+    check_flag(retval, "CVodeGetNumSteps");
+    retval = CVodeGetNumRhsEvals(cvode_mem, &nfe);
+    check_flag(retval, "CVodeGetNumRhsEvals");
+    retval = CVodeGetNumLinSolvSetups(cvode_mem, &nsetups);
+    check_flag(retval, "CVodeGetNumLinSolvSetups");
+    retval = CVodeGetNumErrTestFails(cvode_mem, &netf);
+    check_flag(retval, "CVodeGetNumErrTestFails");
+    retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni);
+    check_flag(retval, "CVodeGetNumNonlinSolvIters");
+    retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncfn);
+    check_flag(retval, "CVodeGetNumNonlinSolvConvFails");
+    retval = CVodeGetNumJacEvals(cvode_mem, &nje);
+    check_flag(retval, "CVodeGetNumJacEvals");
+
+    std::cout << "\nFinal Statistics:\n"
+              << "  Steps            = " << nst << "\n"
+              << "  RHS evals        = " << nfe << "\n"
+              << "  LS setups        = " << nsetups << "\n"
+              << "  Jac evals        = " << nje << "\n"
+              << "  NLS iters        = " << nni << "\n"
+              << "  NLS fails        = " << ncfn << "\n"
+              << "  Error test fails = " << netf << "\n";
+
+    // Free the integrator memory
+    CVodeFree(&cvode_mem);
+  }
+  Kokkos::finalize();
+
+  return 0;
+}
+
+/* -----------------------------------------------------------------------------
+ * User-supplied functions called by CVODE
+ * ---------------------------------------------------------------------------*/
+
+// ODE right-hand side: evaluates ydot = f(t,y) for every batch
+int f(sunrealtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  const UserData* prob = static_cast<UserData*>(user_data);
+
+  // Local copies of the problem sizes and parameters used inside the kernel
+  const int nb = prob->nbatches;
+  const int bs = prob->batchSize;
+  const sunrealtype a  = prob->a;
+  const sunrealtype b  = prob->b;
+  const sunrealtype ep = prob->ep;
+
+  // 2D (batch, component) views over N_VGetDeviceArrayPointer's arrays
+  Vec2D state(N_VGetDeviceArrayPointer(y), nb, bs);
+  Vec2D rate(N_VGetDeviceArrayPointer(ydot), nb, bs);
+
+  Kokkos::parallel_for(
+    "RHS", Kokkos::RangePolicy<ExecSpace>(0, nb),
+    KOKKOS_LAMBDA(const SizeType ib) {
+      const auto u = state(ib, 0);
+      const auto v = state(ib, 1);
+      const auto w = state(ib, 2);
+      rate(ib, 0) = a - (w + ONE) * u + v * u * u;
+      rate(ib, 1) = w * u - v * u * u;
+      rate(ib, 2) = (b - w) / ep - w * u;
+    });
+  return 0;
+}
+
+// Jacobian of f(t,y): fills one dense block of J per batch from the state y
+int Jac(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data,
+        N_Vector tmp1, N_Vector tmp2, N_Vector tmp3)
+{
+  auto udata  = static_cast<UserData*>(user_data);
+  auto J_data = sundials::kokkos::GetDenseMat<MatType>(J)->View();
+
+  const auto nbatches  = udata->nbatches;
+  const auto batchSize = udata->batchSize;
+  const auto ep        = udata->ep;
+
+  Vec2D y2d(N_VGetDeviceArrayPointer(y), nbatches, batchSize);
+
+  Kokkos::parallel_for(
+    "Jac", Kokkos::RangePolicy<ExecSpace>(0, nbatches),
+    KOKKOS_LAMBDA(const SizeType i) {
+      auto u = y2d(i,0);
+      auto v = y2d(i,1);
+      auto w = y2d(i,2);
+
+      // NOTE(review): in f, ydot(i,1) = w*u - v*u*u has d/du = w - 2*u*v, yet
+      // J_data(i,1,0) and J_data(i,0,1) are both u*u; confirm these entries.
+      // first col of block
+      J_data(i, 0, 0) = -(w + ONE) + TWO * u * v;
+      J_data(i, 1, 0) = u * u;
+      J_data(i, 2, 0) = -u;
+
+      // second col of block
+      J_data(i, 0, 1) = u * u;
+      J_data(i, 1, 1) = -u * u;
+      J_data(i, 2, 1) = u;
+
+      // third col of block
+      J_data(i, 0, 2) = -w;
+      J_data(i, 1, 2) = ZERO;
+      J_data(i, 2, 2) = -ONE / ep - u;
+    });
+
+  return 0;
+}
diff --git a/examples/nvector/sycl/test_nvector_sycl.cpp b/examples/nvector/sycl/test_nvector_sycl.cpp
index 4ce143451c..b3b6f5c8dc 100644
--- a/examples/nvector/sycl/test_nvector_sycl.cpp
+++ b/examples/nvector/sycl/test_nvector_sycl.cpp
@@ -69,7 +69,7 @@ int main(int argc, char *argv[])
   SetTiming(print_timing, 0);
 
   /* Create an in-order GPU queue */
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp b/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp
index 6c63f71e6e..c3adc0a08e 100644
--- a/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp
+++ b/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp
@@ -71,7 +71,7 @@ int main(int argc, char *argv[])
          (long int) cols, (long int) nblocks);
 
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp b/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp
index 75d11de689..615e1ee9dd 100644
--- a/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp
+++ b/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp
@@ -80,7 +80,7 @@ int main(int argc, char *argv[])
          (long int) matrows, (long int) matcols);
 
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else
diff --git a/include/arkode/arkode_arkstep.h b/include/arkode/arkode_arkstep.h
index 7d624635c5..d293f7611c 100644
--- a/include/arkode/arkode_arkstep.h
+++ b/include/arkode/arkode_arkstep.h
@@ -50,9 +50,11 @@ static const int ARKSTEP_DEFAULT_DIRK_4 = ARKODE_SDIRK_5_3_4;
 static const int ARKSTEP_DEFAULT_DIRK_5 = ARKODE_ARK548L2SA_DIRK_8_4_5;
 
 /*    ImEx */
+static const int ARKSTEP_DEFAULT_ARK_ETABLE_2 = ARKODE_ARK2_ERK_3_1_2;
 static const int ARKSTEP_DEFAULT_ARK_ETABLE_3 = ARKODE_ARK324L2SA_ERK_4_2_3;
 static const int ARKSTEP_DEFAULT_ARK_ETABLE_4 = ARKODE_ARK436L2SA_ERK_6_3_4;
 static const int ARKSTEP_DEFAULT_ARK_ETABLE_5 = ARKODE_ARK548L2SA_ERK_8_4_5;
+static const int ARKSTEP_DEFAULT_ARK_ITABLE_2 = ARKODE_ARK2_DIRK_3_1_2;
 static const int ARKSTEP_DEFAULT_ARK_ITABLE_3 = ARKODE_ARK324L2SA_DIRK_4_2_3;
 static const int ARKSTEP_DEFAULT_ARK_ITABLE_4 = ARKODE_ARK436L2SA_DIRK_6_3_4;
 static const int ARKSTEP_DEFAULT_ARK_ITABLE_5 = ARKODE_ARK548L2SA_DIRK_8_4_5;
@@ -238,6 +240,8 @@ SUNDIALS_EXPORT int ARKStepSetMinStep(void *arkode_mem,
                                       realtype hmin);
 SUNDIALS_EXPORT int ARKStepSetMaxStep(void *arkode_mem,
                                       realtype hmax);
+SUNDIALS_EXPORT int ARKStepSetInterpolateStopTime(void *arkode_mem,
+                                                  booleantype interp);
 SUNDIALS_EXPORT int ARKStepSetStopTime(void *arkode_mem,
                                        realtype tstop);
 SUNDIALS_EXPORT int ARKStepClearStopTime(void *arkode_mem);
diff --git a/include/arkode/arkode_butcher_dirk.h b/include/arkode/arkode_butcher_dirk.h
index 76f1d1cb7d..aecaf16a82 100644
--- a/include/arkode/arkode_butcher_dirk.h
+++ b/include/arkode/arkode_butcher_dirk.h
@@ -92,7 +92,8 @@ typedef enum {
   ARKODE_ESDIRK437L2SA_7_3_4,
   ARKODE_ESDIRK547L2SA_7_4_5,
   ARKODE_ESDIRK547L2SA2_7_4_5,
-  ARKODE_MAX_DIRK_NUM = ARKODE_ESDIRK547L2SA2_7_4_5
+  ARKODE_ARK2_DIRK_3_1_2,
+  ARKODE_MAX_DIRK_NUM = ARKODE_ARK2_DIRK_3_1_2
 } ARKODE_DIRKTableID;
 
 /* Accessor routine to load built-in DIRK table */
diff --git a/include/arkode/arkode_butcher_erk.h b/include/arkode/arkode_butcher_erk.h
index acd1d613fb..6673acb119 100644
--- a/include/arkode/arkode_butcher_erk.h
+++ b/include/arkode/arkode_butcher_erk.h
@@ -84,7 +84,8 @@ typedef enum {
   ARKODE_KNOTH_WOLKE_3_3,
   ARKODE_ARK437L2SA_ERK_7_3_4,
   ARKODE_ARK548L2SAb_ERK_8_4_5,
-  ARKODE_MAX_ERK_NUM = ARKODE_ARK548L2SAb_ERK_8_4_5
+  ARKODE_ARK2_ERK_3_1_2,
+  ARKODE_MAX_ERK_NUM = ARKODE_ARK2_ERK_3_1_2
 } ARKODE_ERKTableID;
 
 /* Accessor routine to load built-in ERK table */
diff --git a/include/arkode/arkode_erkstep.h b/include/arkode/arkode_erkstep.h
index 7c877a4de3..a9f6f2d13e 100644
--- a/include/arkode/arkode_erkstep.h
+++ b/include/arkode/arkode_erkstep.h
@@ -153,6 +153,8 @@ SUNDIALS_EXPORT int ERKStepSetMinStep(void *arkode_mem,
                                       realtype hmin);
 SUNDIALS_EXPORT int ERKStepSetMaxStep(void *arkode_mem,
                                       realtype hmax);
+SUNDIALS_EXPORT int ERKStepSetInterpolateStopTime(void *arkode_mem,
+                                                  booleantype interp);
 SUNDIALS_EXPORT int ERKStepSetStopTime(void *arkode_mem,
                                        realtype tstop);
 SUNDIALS_EXPORT int ERKStepClearStopTime(void *arkode_mem);
diff --git a/include/arkode/arkode_mristep.h b/include/arkode/arkode_mristep.h
index 6d91f74dc8..d4d889813f 100644
--- a/include/arkode/arkode_mristep.h
+++ b/include/arkode/arkode_mristep.h
@@ -253,6 +253,8 @@ SUNDIALS_EXPORT int MRIStepSetMaxHnilWarns(void *arkode_mem,
                                            int mxhnil);
 SUNDIALS_EXPORT int MRIStepSetStopTime(void *arkode_mem,
                                        realtype tstop);
+SUNDIALS_EXPORT int MRIStepSetInterpolateStopTime(void *arkode_mem,
+                                                  booleantype interp);
 SUNDIALS_EXPORT int MRIStepClearStopTime(void *arkode_mem);
 SUNDIALS_EXPORT int MRIStepSetFixedStep(void *arkode_mem,
                                         realtype hsfixed);
diff --git a/include/cvode/cvode.h b/include/cvode/cvode.h
index db21479fe8..35647f983e 100644
--- a/include/cvode/cvode.h
+++ b/include/cvode/cvode.h
@@ -147,6 +147,7 @@ SUNDIALS_EXPORT int CVodeSetNonlinConvCoef(void *cvode_mem, realtype nlscoef);
 SUNDIALS_EXPORT int CVodeSetNonlinearSolver(void *cvode_mem, SUNNonlinearSolver NLS);
 SUNDIALS_EXPORT int CVodeSetStabLimDet(void *cvode_mem, booleantype stldet);
 SUNDIALS_EXPORT int CVodeSetStopTime(void *cvode_mem, realtype tstop);
+SUNDIALS_EXPORT int CVodeSetInterpolateStopTime(void *cvode_mem, booleantype interp);
 SUNDIALS_EXPORT int CVodeClearStopTime(void *cvode_mem);
 SUNDIALS_EXPORT int CVodeSetUseIntegratorFusedKernels(void *cvode_mem, booleantype onoff);
 SUNDIALS_EXPORT int CVodeSetUserData(void *cvode_mem, void *user_data);
diff --git a/include/cvodes/cvodes.h b/include/cvodes/cvodes.h
index 3dbdad7900..b1a7892377 100644
--- a/include/cvodes/cvodes.h
+++ b/include/cvodes/cvodes.h
@@ -222,6 +222,7 @@ SUNDIALS_EXPORT int CVodeSetNonlinConvCoef(void *cvode_mem, realtype nlscoef);
 SUNDIALS_EXPORT int CVodeSetNonlinearSolver(void *cvode_mem, SUNNonlinearSolver NLS);
 SUNDIALS_EXPORT int CVodeSetStabLimDet(void *cvode_mem, booleantype stldet);
 SUNDIALS_EXPORT int CVodeSetStopTime(void *cvode_mem, realtype tstop);
+SUNDIALS_EXPORT int CVodeSetInterpolateStopTime(void *cvode_mem, booleantype interp);
 SUNDIALS_EXPORT int CVodeClearStopTime(void *cvode_mem);
 SUNDIALS_EXPORT int CVodeSetUserData(void *cvode_mem, void *user_data);
 
diff --git a/include/nvector/nvector_kokkos.hpp b/include/nvector/nvector_kokkos.hpp
index 1b424a7d44..269d15dcef 100644
--- a/include/nvector/nvector_kokkos.hpp
+++ b/include/nvector/nvector_kokkos.hpp
@@ -650,6 +650,20 @@ void CopyFromDevice(VectorType& v)
   Kokkos::deep_copy(v.HostView(), v.View());
 }
 
+// Returns View() of the VectorType object that GetVec extracts from v
+template<class VectorType, class view_type>
+view_type GetView(N_Vector v)
+{
+  return GetVec<VectorType>(v)->View();
+}
+
+// Returns HostView() of the VectorType object that GetVec extracts from v
+template<class VectorType, class host_view_type>
+host_view_type GetHostView(N_Vector v)
+{
+  return GetVec<VectorType>(v)->HostView();
+}
+
 } // namespace kokkos
 } // namespace sundials
 
diff --git a/include/sundials/sundials_config.in b/include/sundials/sundials_config.in
index 545c1aebe8..f3f66ff89c 100644
--- a/include/sundials/sundials_config.in
+++ b/include/sundials/sundials_config.in
@@ -130,12 +130,16 @@
  */
 #cmakedefine01 SUNDIALS_MPI_ENABLED
 
- /* SUPERLUMT threading type */
-#cmakedefine SUNDIALS_SUPERLUMT_THREAD_TYPE "@SUPERLUMT_THREAD_TYPE@"
+/* oneMKL interface options */
+#cmakedefine SUNDIALS_ONEMKL_USE_GETRF_LOOP
+#cmakedefine SUNDIALS_ONEMKL_USE_GETRS_LOOP
 
- /* Trilinos with MPI is available, then
-  *    #define SUNDIALS_TRILINOS_HAVE_MPI
-  */
+/* SUPERLUMT threading type */
+#define SUNDIALS_SUPERLUMT_THREAD_TYPE "@SUPERLUMT_THREAD_TYPE@"
+
+/* Trilinos with MPI is available, then
+ *    #define SUNDIALS_TRILINOS_HAVE_MPI
+ */
 #cmakedefine SUNDIALS_TRILINOS_HAVE_MPI
 
 /* RAJA backends */
@@ -143,6 +147,10 @@
 #cmakedefine SUNDIALS_RAJA_BACKENDS_HIP
 #cmakedefine SUNDIALS_RAJA_BACKENDS_SYCL
 
+/* SYCL options */
+#cmakedefine SUNDIALS_SYCL_2020_UNSUPPORTED
+
+
 /* ------------------------------------------------------------------
  * SUNDIALS modules enabled
  * -----------------------------------------------------------------*/
diff --git a/scripts/cvode b/scripts/cvode
index 57261482ba..af97c77f74 100755
--- a/scripts/cvode
+++ b/scripts/cvode
@@ -221,6 +221,10 @@ $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.cpp
 $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.CUDA.out
 $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.OPENMP.out
 $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.SERIAL.out
+$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp
+$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out
+$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out
+$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out
 
 $tar $tarfile $distrobase/examples/cvode/magma/README
 $tar $tarfile $distrobase/examples/cvode/magma/CMakeLists.txt
diff --git a/src/arkode/arkode.c b/src/arkode/arkode.c
index 9b6f7e9e5b..744abfed2a 100644
--- a/src/arkode/arkode.c
+++ b/src/arkode/arkode.c
@@ -962,7 +962,11 @@ int arkEvolve(ARKodeMem ark_mem, realtype tout, N_Vector yout,
       troundoff = FUZZ_FACTOR*ark_mem->uround *
         (SUNRabs(ark_mem->tcur) + SUNRabs(ark_mem->h));
       if ( SUNRabs(ark_mem->tcur - ark_mem->tstop) <= troundoff) {
-        (void) arkGetDky(ark_mem, ark_mem->tstop, 0, yout);
+        if (ark_mem->tstopinterp) {
+          (void) arkGetDky(ark_mem, ark_mem->tstop, 0, yout);
+        } else {
+          N_VScale(ONE, ark_mem->yn, yout);
+        }
         ark_mem->tretlast = *tret = ark_mem->tstop;
         ark_mem->tstopset = SUNFALSE;
         istate = ARK_TSTOP_RETURN;
@@ -1356,6 +1360,7 @@ void arkPrintMem(ARKodeMem ark_mem, FILE *outfile)
   fprintf(outfile, "liw = %li\n", (long int) ark_mem->liw);
   fprintf(outfile, "user_efun = %i\n", ark_mem->user_efun);
   fprintf(outfile, "tstopset = %i\n", ark_mem->tstopset);
+  fprintf(outfile, "tstopinterp = %i\n", ark_mem->tstopinterp);
   fprintf(outfile, "tstop = %" RSYM"\n", ark_mem->tstop);
   fprintf(outfile, "report = %i\n", ark_mem->report);
   fprintf(outfile, "VabstolMallocDone = %i\n", ark_mem->VabstolMallocDone);
diff --git a/src/arkode/arkode_arkstep.c b/src/arkode/arkode_arkstep.c
index ef38649f90..8b5bde3a1b 100644
--- a/src/arkode/arkode_arkstep.c
+++ b/src/arkode/arkode_arkstep.c
@@ -1924,6 +1924,9 @@ int arkStep_SetButcherTables(ARKodeMem ark_mem)
     switch (step_mem->q) {
 
     case(2):
+      etable = ARKSTEP_DEFAULT_ARK_ETABLE_2;
+      itable = ARKSTEP_DEFAULT_ARK_ITABLE_2;
+      break;
     case(3):
       etable = ARKSTEP_DEFAULT_ARK_ETABLE_3;
       itable = ARKSTEP_DEFAULT_ARK_ITABLE_3;
diff --git a/src/arkode/arkode_arkstep_io.c b/src/arkode/arkode_arkstep_io.c
index bfa713fda9..93c99e6014 100644
--- a/src/arkode/arkode_arkstep_io.c
+++ b/src/arkode/arkode_arkstep_io.c
@@ -59,6 +59,9 @@ int ARKStepSetMaxStep(void *arkode_mem, realtype hmax) {
   return(arkSetMaxStep(arkode_mem, hmax)); }
 int ARKStepSetStopTime(void *arkode_mem, realtype tstop) {
   return(arkSetStopTime(arkode_mem, tstop)); }
+int ARKStepSetInterpolateStopTime(void *arkode_mem,
+                                  booleantype interp) {
+  return(arkSetInterpolateStopTime(arkode_mem, interp)); }
 int ARKStepClearStopTime(void *arkode_mem) {
   return(arkClearStopTime(arkode_mem)); }
 int ARKStepSetRootDirection(void *arkode_mem, int *rootdir) {
diff --git a/src/arkode/arkode_butcher_dirk.c b/src/arkode/arkode_butcher_dirk.c
index d1346acdd4..b74a543a52 100644
--- a/src/arkode/arkode_butcher_dirk.c
+++ b/src/arkode/arkode_butcher_dirk.c
@@ -66,8 +66,8 @@ ARKODE_DIRKTableID arkButcherTableDIRKNameToID(const char *imethod) {
 #undef ARK_BUTCHER_TABLE
 
   arkProcessError(NULL, ARK_ILL_INPUT, "ARKODE",
-	   "arkButcherTableDIRKNameToID",
-	   "Unknown Butcher table");
+                  "arkButcherTableDIRKNameToID",
+                  "Unknown Butcher table");
 
   return ARKODE_DIRK_NONE;
 }
diff --git a/src/arkode/arkode_butcher_dirk.def b/src/arkode/arkode_butcher_dirk.def
index f9e63023ab..a3cca75f66 100644
--- a/src/arkode/arkode_butcher_dirk.def
+++ b/src/arkode/arkode_butcher_dirk.def
@@ -57,7 +57,8 @@
      ARKODE_ARK548L2SAb_DIRK_8_4_5*   ESDIRK     Y         Y       N
      ARKODE_ESDIRK547L2SA_7_4_5       ESDIRK     Y         Y       N
      ARKODE_ESDIRK547L2SA2_7_4_5      ESDIRK     Y         Y       N
-    -----------------------------------------------------------------
+     ARKODE_ARK2_DIRK_3_1_2           ESDIRK     Y         Y       Y
+     -----------------------------------------------------------------
 */
 
 ARK_BUTCHER_TABLE(ARKODE_DIRK_NONE, {
@@ -68,22 +69,56 @@ ARK_BUTCHER_TABLE(ARKODE_SDIRK_2_1_2, { /* SDIRK-2-1 (A,B stable) */
     ARKodeButcherTable B = ARKodeButcherTable_Alloc(2, SUNTRUE);
     B->q = 2;
     B->p = 1;
-    
+
     B->A[0][0] = RCONST(1.0);
     B->A[1][0] = RCONST(-1.0);
     B->A[1][1] = RCONST(1.0);
-    
+
     B->b[0] = RCONST(0.5);
     B->b[1] = RCONST(0.5);
-    
+
     B->d[0] = RCONST(1.0);
 
-    
     B->c[0] = RCONST(1.0);
     B->c[1] = RCONST(0.0);
     return B;
   })
 
+ARK_BUTCHER_TABLE(ARKODE_ARK2_DIRK_3_1_2, { /* ARK2 Implicit Table (A,L stable) */
+    ARKodeButcherTable B = ARKodeButcherTable_Alloc(3, SUNTRUE);
+
+    /* 1 - 1 / sqrt(2) */
+    const sunrealtype gamma = SUN_RCONST(1.0) - SUN_RCONST(1.0) / SUNRsqrt(SUN_RCONST(2.0));
+    /* 1 / (2 sqrt(2)), also the last embedding coefficient d[2] */
+    const sunrealtype delta = SUN_RCONST(1.0) / (SUN_RCONST(2.0) * SUNRsqrt(SUN_RCONST(2.0)));
+    /* 2 - sqrt(2) */
+    const sunrealtype twogamma = SUN_RCONST(2.0) - SUNRsqrt(SUN_RCONST(2.0));
+    /* (4 - sqrt(2)) / 8 */
+    const sunrealtype beta = (SUN_RCONST(4.0) - SUNRsqrt(SUN_RCONST(2.0))) / SUN_RCONST(8.0);
+
+    B->q = 2;
+    B->p = 1;
+
+    B->A[1][0] = gamma;
+    B->A[1][1] = gamma;
+    B->A[2][0] = delta;
+    B->A[2][1] = delta;
+    B->A[2][2] = gamma;
+
+    B->b[0] = delta;
+    B->b[1] = delta;
+    B->b[2] = gamma;
+
+    B->d[0] = beta;
+    B->d[1] = beta;
+    B->d[2] = delta;
+
+    B->c[1] = twogamma;
+    B->c[2] = SUN_RCONST(1.0);
+
+    return B;
+  })
+
 ARK_BUTCHER_TABLE(ARKODE_BILLINGTON_3_3_2, { /* Billington-SDIRK */
     ARKodeButcherTable B = ARKodeButcherTable_Alloc(3, SUNTRUE);
 
@@ -620,7 +655,7 @@ ARK_BUTCHER_TABLE(ARKODE_ESDIRK324L2SA_4_2_3, { /* ESDIRK3(2)4L[2]SA (A,L stable
     const sunrealtype g4 = g3 * g;
     const sunrealtype g5 = g4 * g;
     const sunrealtype c3 = RCONST(0.6);
-    
+
     ARKodeButcherTable B = ARKodeButcherTable_Alloc(4, SUNTRUE);
     B->q = 3;
     B->p = 2;
diff --git a/src/arkode/arkode_butcher_erk.def b/src/arkode/arkode_butcher_erk.def
index 43a8fe7210..f25fafb830 100644
--- a/src/arkode/arkode_butcher_erk.def
+++ b/src/arkode/arkode_butcher_erk.def
@@ -52,6 +52,7 @@
      ARKODE_ARK548L2SAb_ERK_8_4_5*       N
      ARKODE_VERNER_8_5_6                 Y
      ARKODE_FEHLBERG_13_7_8              Y
+     ARKODE_ARK2_ERK_3_1_2               Y
     --------------------------------
      ARKODE_KNOTH_WOLKE_3_3^             Y
     --------------------------------
@@ -75,7 +76,42 @@ ARK_BUTCHER_TABLE(ARKODE_HEUN_EULER_2_1_2, { /* Heun-Euler-ERK */
 
     B->c[1] = RCONST(1.0);
     return B;
- }) 
+ })
+
+ARK_BUTCHER_TABLE(ARKODE_ARK2_ERK_3_1_2, { /* ARK2 Explicit Table */
+    ARKodeButcherTable B = ARKodeButcherTable_Alloc(3, SUNTRUE);
+
+    /* 1 - 1 / sqrt(2) */
+    const sunrealtype gamma = SUN_RCONST(1.0) - SUN_RCONST(1.0) / SUNRsqrt(SUN_RCONST(2.0));
+    /* (3 + 2 sqrt(2)) / 6 */
+    const sunrealtype alpha = (SUN_RCONST(3.0) + SUN_RCONST(2.0) * SUNRsqrt(SUN_RCONST(2.0))) / SUN_RCONST(6.0);
+    /* 1 / (2 sqrt(2)), also the last embedding coefficient d[2] */
+    const sunrealtype delta = SUN_RCONST(1.0) / (SUN_RCONST(2.0) * SUNRsqrt(SUN_RCONST(2.0)));
+    /* 2 - sqrt(2) */
+    const sunrealtype twogamma = SUN_RCONST(2.0) - SUNRsqrt(SUN_RCONST(2.0));
+    /* (4 - sqrt(2)) / 8 */
+    const sunrealtype beta = (SUN_RCONST(4.0) - SUNRsqrt(SUN_RCONST(2.0))) / SUN_RCONST(8.0);
+
+    B->q = 2;
+    B->p = 1;
+
+    B->A[1][0] = twogamma;
+    B->A[2][0] = SUN_RCONST(1.0) - alpha;
+    B->A[2][1] = alpha;
+
+    B->b[0] = delta;
+    B->b[1] = delta;
+    B->b[2] = gamma;
+
+    B->d[0] = beta;
+    B->d[1] = beta;
+    B->d[2] = delta;
+
+    B->c[1] = twogamma;
+    B->c[2] = SUN_RCONST(1.0);
+
+    return B;
+  })
 
 ARK_BUTCHER_TABLE(ARKODE_BOGACKI_SHAMPINE_4_2_3, { /* Bogacki-Shampine-ERK */
     ARKodeButcherTable B = ARKodeButcherTable_Alloc(4, SUNTRUE);
@@ -684,4 +720,3 @@ ARK_BUTCHER_TABLE(ARKODE_KNOTH_WOLKE_3_3, { /* Knoth-Wolke-ERK */
     B->c[2] = RCONST(3.0)/RCONST(4.0);
     return B;
   })
-
diff --git a/src/arkode/arkode_erkstep_io.c b/src/arkode/arkode_erkstep_io.c
index fbce038dc3..d6cdb2fa35 100644
--- a/src/arkode/arkode_erkstep_io.c
+++ b/src/arkode/arkode_erkstep_io.c
@@ -59,6 +59,9 @@ int ERKStepSetMaxStep(void *arkode_mem, realtype hmax) {
   return(arkSetMaxStep(arkode_mem, hmax)); }
 int ERKStepSetStopTime(void *arkode_mem, realtype tstop) {
   return(arkSetStopTime(arkode_mem, tstop)); }
+int ERKStepSetInterpolateStopTime(void *arkode_mem,
+                                  booleantype interp) {
+  return(arkSetInterpolateStopTime(arkode_mem, interp)); }
 int ERKStepClearStopTime(void *arkode_mem) {
   return(arkClearStopTime(arkode_mem)); }
 int ERKStepSetRootDirection(void *arkode_mem, int *rootdir) {
diff --git a/src/arkode/arkode_impl.h b/src/arkode/arkode_impl.h
index f099af8444..2b2350d64f 100644
--- a/src/arkode/arkode_impl.h
+++ b/src/arkode/arkode_impl.h
@@ -336,6 +336,7 @@ typedef struct ARKodeMemRec {
 
   /* Tstop information */
   booleantype tstopset;
+  booleantype tstopinterp;
   realtype    tstop;
 
   /* Time step data */
@@ -975,6 +976,7 @@ int arkSetInitStep(void *arkode_mem, realtype hin);
 int arkSetMinStep(void *arkode_mem, realtype hmin);
 int arkSetMaxStep(void *arkode_mem, realtype hmax);
 int arkSetStopTime(void *arkode_mem, realtype tstop);
+int arkSetInterpolateStopTime(void *arkode_mem, booleantype interp);
 int arkClearStopTime(void *arkode_mem);
 int arkSetFixedStep(void *arkode_mem, realtype hfixed);
 int arkSetRootDirection(void *arkode_mem, int *rootdir);
diff --git a/src/arkode/arkode_io.c b/src/arkode/arkode_io.c
index aadc4044f4..b4fda7d21c 100644
--- a/src/arkode/arkode_io.c
+++ b/src/arkode/arkode_io.c
@@ -80,6 +80,7 @@ int arkSetDefaults(void *arkode_mem)
   ark_mem->hmin                    = ZERO;           /* no minimum step size */
   ark_mem->hmax_inv                = ZERO;           /* no maximum step size */
   ark_mem->tstopset                = SUNFALSE;       /* no stop time set */
+  ark_mem->tstopinterp             = SUNFALSE;       /* copy at stop time */
   ark_mem->tstop                   = ZERO;           /* no fixed stop time */
   ark_mem->diagfp                  = NULL;           /* no solver diagnostics file */
   ark_mem->report                  = SUNFALSE;       /* don't report solver diagnostics */
@@ -518,6 +519,26 @@ int arkSetStopTime(void *arkode_mem, realtype tstop)
 }
 
 
+/*---------------------------------------------------------------
+  arkSetInterpolateStopTime:
+
+  Sets whether the solution returned at the stop time is filled by
+  interpolation (interp true) or by copying the internal solution.
+  ---------------------------------------------------------------*/
+int arkSetInterpolateStopTime(void *arkode_mem, booleantype interp)
+{
+  ARKodeMem ark_mem;
+  if (arkode_mem==NULL) {
+    arkProcessError(NULL, ARK_MEM_NULL, "ARKODE",
+                    "arkSetInterpolateStopTime", MSG_ARK_NO_MEM);
+    return (ARK_MEM_NULL);
+  }
+  ark_mem = (ARKodeMem) arkode_mem;
+  ark_mem->tstopinterp = interp;
+  return(ARK_SUCCESS);
+}
+
+
 /*---------------------------------------------------------------
   arkClearStopTime:
 
diff --git a/src/arkode/arkode_mristep_io.c b/src/arkode/arkode_mristep_io.c
index c722cfd26a..5c6ced636e 100644
--- a/src/arkode/arkode_mristep_io.c
+++ b/src/arkode/arkode_mristep_io.c
@@ -48,6 +48,9 @@ int MRIStepSetMaxHnilWarns(void *arkode_mem, int mxhnil) {
   return(arkSetMaxHnilWarns(arkode_mem, mxhnil)); }
 int MRIStepSetStopTime(void *arkode_mem, realtype tstop) {
   return(arkSetStopTime(arkode_mem, tstop)); }
+int MRIStepSetInterpolateStopTime(void *arkode_mem,
+                                  booleantype interp) {
+  return(arkSetInterpolateStopTime(arkode_mem, interp)); }
 int MRIStepClearStopTime(void *arkode_mem) {
   return(arkClearStopTime(arkode_mem)); }
 int MRIStepSetRootDirection(void *arkode_mem, int *rootdir) {
diff --git a/src/cvode/cvode.c b/src/cvode/cvode.c
index 609b55a52f..e2a56c3850 100644
--- a/src/cvode/cvode.c
+++ b/src/cvode/cvode.c
@@ -323,6 +323,7 @@ void *CVodeCreate(int lmm, SUNContext sunctx)
   cv_mem->cv_small_nst        = SMALL_NST_DEFAULT;
   cv_mem->cv_small_nef        = SMALL_NEF_DEFAULT;
   cv_mem->cv_tstopset         = SUNFALSE;
+  cv_mem->cv_tstopinterp      = SUNFALSE;
   cv_mem->cv_maxnef           = MXNEF;
   cv_mem->cv_maxncf           = MXNCF;
   cv_mem->cv_nlscoef          = CORTES;
@@ -1258,12 +1259,16 @@ int CVode(void *cvode_mem, realtype tout, N_Vector yout,
     if ( cv_mem->cv_tstopset ) {
 
       if ( SUNRabs(cv_mem->cv_tn - cv_mem->cv_tstop) <= troundoff ) {
-        ier =  CVodeGetDky(cv_mem, cv_mem->cv_tstop, 0, yout);
-        if (ier != CV_SUCCESS) {
-          cvProcessError(cv_mem, CV_ILL_INPUT, "CVODE", "CVode",
-                         MSGCV_BAD_TSTOP, cv_mem->cv_tstop, cv_mem->cv_tn);
-          SUNDIALS_MARK_FUNCTION_END(CV_PROFILER);
-          return(CV_ILL_INPUT);
+        if (cv_mem->cv_tstopinterp) {
+          ier =  CVodeGetDky(cv_mem, cv_mem->cv_tstop, 0, yout);
+          if (ier != CV_SUCCESS) {
+            cvProcessError(cv_mem, CV_ILL_INPUT, "CVODE", "CVode",
+                           MSGCV_BAD_TSTOP, cv_mem->cv_tstop, cv_mem->cv_tn);
+            SUNDIALS_MARK_FUNCTION_END(CV_PROFILER);
+            return(CV_ILL_INPUT);
+          }
+        } else {
+          N_VScale(ONE, cv_mem->cv_zn[0], yout);
         }
         cv_mem->cv_tretlast = *tret = cv_mem->cv_tstop;
         cv_mem->cv_tstopset = SUNFALSE;
@@ -1434,7 +1439,11 @@ int CVode(void *cvode_mem, realtype tout, N_Vector yout,
       troundoff = FUZZ_FACTOR * cv_mem->cv_uround *
         (SUNRabs(cv_mem->cv_tn) + SUNRabs(cv_mem->cv_h));
       if ( SUNRabs(cv_mem->cv_tn - cv_mem->cv_tstop) <= troundoff) {
-        (void) CVodeGetDky(cv_mem, cv_mem->cv_tstop, 0, yout);
+        if (cv_mem->cv_tstopinterp) {
+          (void) CVodeGetDky(cv_mem, cv_mem->cv_tstop, 0, yout);
+        } else {
+          N_VScale(ONE, cv_mem->cv_zn[0], yout);
+        }
         cv_mem->cv_tretlast = *tret = cv_mem->cv_tstop;
         cv_mem->cv_tstopset = SUNFALSE;
         istate = CV_TSTOP_RETURN;
diff --git a/src/cvode/cvode_impl.h b/src/cvode/cvode_impl.h
index e5a65843a7..966626b715 100644
--- a/src/cvode/cvode_impl.h
+++ b/src/cvode/cvode_impl.h
@@ -256,6 +256,7 @@ typedef struct CVodeMemRec {
     -----------------*/
 
   booleantype cv_tstopset;
+  booleantype cv_tstopinterp;
   realtype cv_tstop;
 
   /*---------
diff --git a/src/cvode/cvode_io.c b/src/cvode/cvode_io.c
index ce947d6a2c..174462b552 100644
--- a/src/cvode/cvode_io.c
+++ b/src/cvode/cvode_io.c
@@ -714,6 +714,27 @@ int CVodeSetStopTime(void *cvode_mem, realtype tstop)
   return(CV_SUCCESS);
 }
 
+/*
+ * CVodeSetInterpolateStopTime
+ *
+ * Sets whether the solution returned at the stop time is filled by
+ * interpolation (interp true) or by copying the internal solution.
+ */
+
+int CVodeSetInterpolateStopTime(void *cvode_mem, booleantype interp)
+{
+  CVodeMem cv_mem;
+
+  if (cvode_mem==NULL) {
+    cvProcessError(NULL, CV_MEM_NULL, "CVODE", "CVodeSetInterpolateStopTime", MSGCV_NO_MEM);
+    return (CV_MEM_NULL);
+  }
+  cv_mem = (CVodeMem) cvode_mem;
+  cv_mem->cv_tstopinterp = interp;
+
+  return(CV_SUCCESS);
+}
+
 /*
  * CVodeClearStopTime
  *
diff --git a/src/cvode/cvode_ls.c b/src/cvode/cvode_ls.c
index a32679ab3c..a9d986c193 100644
--- a/src/cvode/cvode_ls.c
+++ b/src/cvode/cvode_ls.c
@@ -1636,7 +1636,8 @@ int cvLsSolve(CVodeMem cv_mem, N_Vector b, N_Vector weight,
               N_Vector ynow, N_Vector fnow)
 {
   CVLsMem  cvls_mem;
-  realtype bnorm, deltar, delta, w_mean;
+  realtype bnorm = ZERO;
+  realtype deltar, delta, w_mean;
   int      curiter, nli_inc, retval;
 #if SUNDIALS_LOGGING_LEVEL >= SUNDIALS_LOGGING_DEBUG
   realtype resnorm;
diff --git a/src/cvodes/cvodes.c b/src/cvodes/cvodes.c
index edf454f720..87538bfa95 100644
--- a/src/cvodes/cvodes.c
+++ b/src/cvodes/cvodes.c
@@ -533,6 +533,7 @@ void *CVodeCreate(int lmm, SUNContext sunctx)
   cv_mem->cv_small_nst        = SMALL_NST_DEFAULT;
   cv_mem->cv_small_nef        = SMALL_NEF_DEFAULT;
   cv_mem->cv_tstopset         = SUNFALSE;
+  cv_mem->cv_tstopinterp      = SUNFALSE;
   cv_mem->cv_maxnef           = MXNEF;
   cv_mem->cv_maxncf           = MXNCF;
   cv_mem->cv_nlscoef          = CORTES;
@@ -3084,12 +3085,16 @@ int CVode(void *cvode_mem, realtype tout, N_Vector yout,
     if ( cv_mem->cv_tstopset ) {
 
       if ( SUNRabs(cv_mem->cv_tn - cv_mem->cv_tstop) <= troundoff ) {
-        ier =  CVodeGetDky(cv_mem, cv_mem->cv_tstop, 0, yout);
-        if (ier != CV_SUCCESS) {
-          cvProcessError(cv_mem, CV_ILL_INPUT, "CVODES", "CVode",
-                         MSGCV_BAD_TSTOP, cv_mem->cv_tstop, cv_mem->cv_tn);
-          SUNDIALS_MARK_FUNCTION_END(CV_PROFILER);
-          return(CV_ILL_INPUT);
+        if (cv_mem->cv_tstopinterp) {
+          ier =  CVodeGetDky(cv_mem, cv_mem->cv_tstop, 0, yout);
+          if (ier != CV_SUCCESS) {
+            cvProcessError(cv_mem, CV_ILL_INPUT, "CVODES", "CVode",
+                           MSGCV_BAD_TSTOP, cv_mem->cv_tstop, cv_mem->cv_tn);
+            SUNDIALS_MARK_FUNCTION_END(CV_PROFILER);
+            return(CV_ILL_INPUT);
+          }
+        } else {
+          N_VScale(ONE, cv_mem->cv_zn[0], yout);
         }
         cv_mem->cv_tretlast = *tret = cv_mem->cv_tstop;
         cv_mem->cv_tstopset = SUNFALSE;
@@ -3304,7 +3309,11 @@ int CVode(void *cvode_mem, realtype tout, N_Vector yout,
       troundoff = FUZZ_FACTOR * cv_mem->cv_uround *
         (SUNRabs(cv_mem->cv_tn) + SUNRabs(cv_mem->cv_h));
       if ( SUNRabs(cv_mem->cv_tn - cv_mem->cv_tstop) <= troundoff) {
-        (void) CVodeGetDky(cv_mem, cv_mem->cv_tstop, 0, yout);
+        if (cv_mem->cv_tstopinterp) {
+          (void) CVodeGetDky(cv_mem, cv_mem->cv_tstop, 0, yout);
+        } else {
+          N_VScale(ONE, cv_mem->cv_zn[0], yout);
+        }
         cv_mem->cv_tretlast = *tret = cv_mem->cv_tstop;
         cv_mem->cv_tstopset = SUNFALSE;
         istate = CV_TSTOP_RETURN;
diff --git a/src/cvodes/cvodes_impl.h b/src/cvodes/cvodes_impl.h
index c1ac992ebf..b4a18676a8 100644
--- a/src/cvodes/cvodes_impl.h
+++ b/src/cvodes/cvodes_impl.h
@@ -384,6 +384,7 @@ typedef struct CVodeMemRec {
     -----------------*/
 
   booleantype cv_tstopset;
+  booleantype cv_tstopinterp;
   realtype cv_tstop;
 
   /*---------
diff --git a/src/cvodes/cvodes_io.c b/src/cvodes/cvodes_io.c
index 1eeea64470..93525e3477 100644
--- a/src/cvodes/cvodes_io.c
+++ b/src/cvodes/cvodes_io.c
@@ -716,6 +716,25 @@ int CVodeSetStopTime(void *cvode_mem, realtype tstop)
   return(CV_SUCCESS);
 }
 
+/*
+ * CVodeSetInterpolateStopTime
+ *
+ * Sets whether the stop-time output is interpolated (interp true) or a copy of the internal solution.
+ */
+
+int CVodeSetInterpolateStopTime(void *cvode_mem, booleantype interp)
+{
+  CVodeMem cv_mem;
+
+  if (cvode_mem==NULL) {
+    cvProcessError(NULL, CV_MEM_NULL, "CVODES", "CVodeSetInterpolateStopTime", MSGCV_NO_MEM);
+    return (CV_MEM_NULL);
+  }
+  cv_mem = (CVodeMem) cvode_mem;
+  cv_mem->cv_tstopinterp = interp;
+  return(CV_SUCCESS);
+}
+
 /*
  * CVodeClearStopTime
  *
diff --git a/src/cvodes/cvodes_ls.c b/src/cvodes/cvodes_ls.c
index b7477bebca..14d12a9bc6 100644
--- a/src/cvodes/cvodes_ls.c
+++ b/src/cvodes/cvodes_ls.c
@@ -1723,7 +1723,8 @@ int cvLsSolve(CVodeMem cv_mem, N_Vector b, N_Vector weight,
               N_Vector ynow, N_Vector fnow)
 {
   CVLsMem  cvls_mem;
-  realtype bnorm, deltar, delta, w_mean;
+  realtype bnorm = ZERO;
+  realtype deltar, delta, w_mean;
   int      curiter, nli_inc, retval;
   booleantype do_sensi_sim, do_sensi_stg, do_sensi_stg1;
 #if SUNDIALS_LOGGING_LEVEL >= SUNDIALS_LOGGING_DEBUG
diff --git a/src/sundials/sundials_futils.c b/src/sundials/sundials_futils.c
index 194a917ae2..7d32c597db 100644
--- a/src/sundials/sundials_futils.c
+++ b/src/sundials/sundials_futils.c
@@ -15,15 +15,37 @@
  * -----------------------------------------------------------------*/
 
 #include <sundials/sundials_futils.h>
+#include <string.h>
 
 /* Create a file pointer with the given file name and mode. */
 FILE* SUNDIALSFileOpen(const char* filename, const char* mode)
 {
-  return fopen(filename, mode);
+  FILE* fp = NULL;
+
+  if (filename)
+  {
+    if (!strcmp(filename, "stdout"))
+    {
+      fp = stdout;
+    }
+    else if (!strcmp(filename, "stderr"))
+    {
+      fp = stderr;
+    }
+    else
+    {
+      fp = fopen(filename, mode);
+    }
+  }
+
+  return fp;
 }
 
 /* Close a file pointer with the given file name. */
 void SUNDIALSFileClose(FILE* fp)
 {
-  fclose(fp);
+  if (fp && (fp != stdout) && (fp != stderr))
+  {
+    fclose(fp);
+  }
 }
diff --git a/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp b/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp
index d1c7165ed0..72e87f53f9 100644
--- a/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp
+++ b/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp
@@ -180,6 +180,13 @@ SUNLinearSolver SUNLinSol_OneMklDense(N_Vector y, SUNMatrix Amat, SUNContext sun
 
   if (num_blocks > 1)
   {
+#ifdef SUNDIALS_ONEMKL_USE_GETRF_LOOP
+    LS_F_SCRATCH_SIZE(S) =
+      getrf_scratchpad_size<realtype>(*queue, // device queue
+                                      M,      // rows in A_i
+                                      N,      // columns in A_i
+                                      M);     // leading dimension
+#else
     LS_F_SCRATCH_SIZE(S) =
       getrf_batch_scratchpad_size<realtype>(*queue,      // device queue
                                             M,           // rows in A_i
@@ -188,8 +195,17 @@ SUNLinearSolver SUNLinSol_OneMklDense(N_Vector y, SUNMatrix Amat, SUNContext sun
                                             M * N,       // stride between A_i
                                             M,           // stride in P_i
                                             num_blocks); // number of blocks
+#endif
 
-#ifdef SUNDIALS_ONEMKL_USE_GETRS_BATCHED
+#ifdef SUNDIALS_ONEMKL_USE_GETRS_LOOP
+    LS_S_SCRATCH_SIZE(S) =
+      getrs_scratchpad_size<realtype>(*queue,  // device queue
+                                      oneapi::mkl::transpose::nontrans,
+                                      M,      // number of rows in A
+                                      1,      // number of right-hand sides
+                                      M,      // leading dimension of A
+                                      M);     // leading dimension of B
+#else
     LS_S_SCRATCH_SIZE(S)=
       getrs_batch_scratchpad_size<realtype>(*queue,      // device queue
                                             oneapi::mkl::transpose::nontrans,
@@ -201,14 +217,6 @@ SUNLinearSolver SUNLinSol_OneMklDense(N_Vector y, SUNMatrix Amat, SUNContext sun
                                             M,           // leading dimension of B_i
                                             M,           // stride between B_i
                                             num_blocks); // number of blocks
-#else
-    LS_S_SCRATCH_SIZE(S) =
-      getrs_scratchpad_size<realtype>(*queue,  // device queue
-                                      oneapi::mkl::transpose::nontrans,
-                                      M,      // number of rows in A
-                                      1,      // number of right-hand sizes
-                                      M,      // leading dimension of A
-                                      M);     // leading dimension of B
 #endif
   }
   else
@@ -326,6 +334,36 @@ int SUNLinSolSetup_OneMklDense(SUNLinearSolver S, SUNMatrix A)
 
   if (num_blocks > 1)
   {
+#ifdef SUNDIALS_ONEMKL_USE_GETRF_LOOP
+    try
+    {
+      for (sunindextype i = 0; i < num_blocks; i++)
+      {
+        getrf(*queue,            // device queue
+              M,                 // number of rows
+              N,                 // number of columns
+              Adata + i * M * N, // matrix data for block i
+              M,                 // leading dimension of A
+              pivots + i * M,    // pivots for block i
+              scratchpad,        // scratchpad memory
+              scratch_size);     // scratchpad size
+      }
+    }
+    catch(oneapi::mkl::lapack::exception const& e)
+    {
+      SUNDIALS_DEBUG_ERROR("An exception occured in getrf\n");
+      if (e.info())
+      {
+        // An illegal value was provided or the scratch pad is too small
+        ier = -1;
+      }
+      else
+      {
+        // Zero diagonal in some U_i; NOTE(review): confirm getrf (non-batch) reports this as info == 0
+        ier = 1;
+      }
+    }
+#else
     try
     {
       getrf_batch(*queue,         // device queue
@@ -354,6 +392,7 @@ int SUNLinSolSetup_OneMklDense(SUNLinearSolver S, SUNMatrix A)
         ier = 1;
       }
     }
+#endif
   }
   else
   {
@@ -467,7 +506,30 @@ int SUNLinSolSolve_OneMklDense(SUNLinearSolver S, SUNMatrix A, N_Vector x,
 
   if (num_blocks > 1)
   {
-#ifdef SUNDIALS_ONEMKL_USE_GETRS_BATCHED
+#ifdef SUNDIALS_ONEMKL_USE_GETRS_LOOP
+    try
+    {
+      for (sunindextype i = 0; i < num_blocks; i++)
+      {
+        getrs(*queue,            // device queue
+              oneapi::mkl::transpose::nontrans,
+              M,                 // number of rows
+              1,                 // number of right-hand sides
+              Adata + i * M * N, // factorized matrix data for block i
+              M,                 // leading dimension of A
+              pivots + i * M,    // pivots for block i (same offset as in the getrf loop)
+              xdata + i * M,     // right-hand side data for block i
+              M,                 // leading dimension of B_i
+              scratchpad,        // scratchpad memory
+              scratch_size);     // scratchpad size
+      }
+    }
+    catch(oneapi::mkl::lapack::exception const& e)
+    {
+      SUNDIALS_DEBUG_ERROR("An exception occured in getrs\n");
+      ier = -1;
+    }
+#else
     try
     {
       getrs_batch(*queue,        // device queue
@@ -491,29 +553,6 @@ int SUNLinSolSolve_OneMklDense(SUNLinearSolver S, SUNMatrix A, N_Vector x,
       SUNDIALS_DEBUG_ERROR("An exception occured in getrs_batch\n");
       ier = -1;
     }
-#else
-    try
-    {
-      for (sunindextype i = 0; i < num_blocks; i++)
-      {
-        getrs(*queue,            // device queue
-              oneapi::mkl::transpose::nontrans,
-              M,                 // number of rows
-              1,                 // number of right-hand sides
-              Adata + i * M * N, // factorized matrix data
-              M,                 // leading dimension of A
-              pivots,            // array of pivots
-              xdata + i * M,     // right-hand side data
-              M,                 // leading dimension of B_i
-              scratchpad,        // scratchpad memory
-              scratch_size);     // scratchpad size
-      }
-    }
-    catch(oneapi::mkl::lapack::exception const& e)
-    {
-      SUNDIALS_DEBUG_ERROR("An exception occured in getrs\n");
-      ier = -1;
-    }
 #endif
   }
   else
diff --git a/test/answers b/test/answers
index adc6da31cd..96d6e170c1 160000
--- a/test/answers
+++ b/test/answers
@@ -1 +1 @@
-Subproject commit adc6da31cd21bfa6e70d6fb026510008643f8ebb
+Subproject commit 96d6e170c15f997d1e9062d4e6478e618d3f30ca
diff --git a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp
index d89ecacc39..8ad033603d 100644
--- a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp
+++ b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp
@@ -29,13 +29,13 @@
 int main() {
 
   // set vectors of individual tables to test
-  std::vector<std::string> Tables_ERK = {"ARKODE_HEUN_EULER_2_1_2",
+  std::vector<std::string> Tables_ERK = {"ARKODE_HEUN_EULER_2_1_2", "ARKODE_ARK2_ERK_3_1_2",
     "ARKODE_BOGACKI_SHAMPINE_4_2_3", "ARKODE_ARK324L2SA_ERK_4_2_3", "ARKODE_ZONNEVELD_5_3_4",
     "ARKODE_ARK436L2SA_ERK_6_3_4", "ARKODE_SAYFY_ABURUB_6_3_4", "ARKODE_CASH_KARP_6_4_5",
     "ARKODE_FEHLBERG_6_4_5", "ARKODE_DORMAND_PRINCE_7_4_5", "ARKODE_ARK548L2SA_ERK_8_4_5",
     "ARKODE_VERNER_8_5_6", "ARKODE_FEHLBERG_13_7_8", "ARKODE_ARK437L2SA_ERK_7_3_4",
     "ARKODE_ARK548L2SAb_ERK_8_4_5"};
-  std::vector<std::string> Tables_DIRK = {"ARKODE_SDIRK_2_1_2",
+  std::vector<std::string> Tables_DIRK = {"ARKODE_SDIRK_2_1_2", "ARKODE_ARK2_DIRK_3_1_2",
     "ARKODE_BILLINGTON_3_3_2", "ARKODE_TRBDF2_3_3_2", "ARKODE_KVAERNO_4_2_3",
     "ARKODE_ARK324L2SA_DIRK_4_2_3", "ARKODE_CASH_5_2_4", "ARKODE_CASH_5_3_4",
     "ARKODE_SDIRK_5_3_4", "ARKODE_KVAERNO_5_3_4", "ARKODE_ARK436L2SA_DIRK_6_3_4",
@@ -44,15 +44,15 @@ int main() {
     "ARKODE_ESDIRK324L2SA_4_2_3", "ARKODE_ESDIRK325L2SA_5_2_3", "ARKODE_ESDIRK32I5L2SA_5_2_3",
     "ARKODE_ESDIRK436L2SA_6_3_4", "ARKODE_ESDIRK43I6L2SA_6_3_4", "ARKODE_QESDIRK436L2SA_6_3_4",
     "ARKODE_ESDIRK437L2SA_7_3_4", "ARKODE_ESDIRK547L2SA_7_4_5", "ARKODE_ESDIRK547L2SA2_7_4_5"};
-  std::vector<ARKODE_ERKTableID> Tables_ARK_ERK = {ARKODE_ARK324L2SA_ERK_4_2_3,
+  std::vector<ARKODE_ERKTableID> Tables_ARK_ERK = {ARKODE_ARK2_ERK_3_1_2, ARKODE_ARK324L2SA_ERK_4_2_3,
     ARKODE_ARK436L2SA_ERK_6_3_4, ARKODE_ARK437L2SA_ERK_7_3_4, ARKODE_ARK548L2SA_ERK_8_4_5,
     ARKODE_ARK548L2SAb_ERK_8_4_5};
-  std::vector<ARKODE_DIRKTableID> Tables_ARK_DIRK = {ARKODE_ARK324L2SA_DIRK_4_2_3,
+  std::vector<ARKODE_DIRKTableID> Tables_ARK_DIRK = {ARKODE_ARK2_DIRK_3_1_2, ARKODE_ARK324L2SA_DIRK_4_2_3,
     ARKODE_ARK436L2SA_DIRK_6_3_4, ARKODE_ARK437L2SA_DIRK_7_3_4, ARKODE_ARK548L2SA_DIRK_8_4_5,
     ARKODE_ARK548L2SAb_DIRK_8_4_5};
-  std::vector<std::string> STables_ARK = {"ARKODE_ARK324L2SA_4_2_3", "ARKODE_ARK436L2SA_6_3_4",
-                                          "ARKODE_ARK437L2SA_7_3_4", "ARKODE_ARK548L2SA_8_4_5",
-                                          "ARKODE_ARK548L2SAb_8_4_5"};
+  std::vector<std::string> STables_ARK = {"ARKODE_ARK2_3_1_2", "ARKODE_ARK324L2SA_4_2_3",
+                                          "ARKODE_ARK436L2SA_6_3_4", "ARKODE_ARK437L2SA_7_3_4",
+                                          "ARKODE_ARK548L2SA_8_4_5", "ARKODE_ARK548L2SAb_8_4_5"};
   int numfails = 0;
 
   // loop over individual ERK tables
diff --git a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out
index 054defea65..8d7971338b 100644
--- a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out
+++ b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out
@@ -2,6 +2,7 @@
 Testing individual ERK methods:
 
 Testing method ARKODE_HEUN_EULER_2_1_2:  table matches predicted method/embedding orders of 2/1
+Testing method ARKODE_ARK2_ERK_3_1_2:  table matches predicted method/embedding orders of 2/1
 Testing method ARKODE_BOGACKI_SHAMPINE_4_2_3:  table matches predicted method/embedding orders of 3/2
 Testing method ARKODE_ARK324L2SA_ERK_4_2_3:  table matches predicted method/embedding orders of 3/2
 Testing method ARKODE_ZONNEVELD_5_3_4:  table matches predicted method/embedding orders of 4/3
@@ -25,6 +26,7 @@ Testing method ARKODE_ARK548L2SAb_ERK_8_4_5:  table matches predicted method/emb
 Testing individual DIRK methods:
 
 Testing method ARKODE_SDIRK_2_1_2:  table matches predicted method/embedding orders of 2/1
+Testing method ARKODE_ARK2_DIRK_3_1_2:  table matches predicted method/embedding orders of 2/1
 Testing method ARKODE_BILLINGTON_3_3_2:  table matches predicted method/embedding orders of 2/3
 Testing method ARKODE_TRBDF2_3_3_2:  table matches predicted method/embedding orders of 2/3
 Testing method ARKODE_KVAERNO_4_2_3:  table matches predicted method/embedding orders of 3/2
@@ -50,6 +52,7 @@ Testing method ARKODE_ESDIRK547L2SA2_7_4_5:  table matches predicted method/embe
 
 Testing ARK pairs:
 
+Testing method ARKODE_ARK2_3_1_2:  Method/embedding match predicted orders of 2/1
 Testing method ARKODE_ARK324L2SA_4_2_3:  Method/embedding match predicted orders of 3/2
 Testing method ARKODE_ARK436L2SA_6_3_4:  Method/embedding match predicted orders of 4/3
 Testing method ARKODE_ARK437L2SA_7_3_4:  Method/embedding match predicted orders of 4/3
diff --git a/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp b/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp
index 325d8654d1..52e1dab5dd 100644
--- a/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp
+++ b/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp
@@ -20,7 +20,7 @@ int test_instance(SUNMemoryHelper helper, SUNMemoryType mem_type,
                   bool print_test_status)
 {
   // Create an in-order GPU queue
-#if SYCL_LANGUAGE_VERSION >= 2020
+#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED)
   sycl::queue myQueue(sycl::gpu_selector_v,
                       sycl::property_list{sycl::property::queue::in_order{}});
 #else