From 420b180741cd44ae905abcbe58c7bbedbf110ba3 Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Fri, 31 Jan 2025 12:29:05 -0800
Subject: [PATCH] Run clang-format to resolve merge conflicts.

---
 CMakeLists.txt                                |    7 +-
 cmake/RAJAMacros.cmake                        |   61 +-
 include/RAJA/RAJA.hpp                         |   28 +-
 include/RAJA/index/IndexSet.hpp               |   28 +-
 include/RAJA/index/IndexSetBuilders.hpp       |   36 +-
 include/RAJA/index/IndexSetUtils.hpp          |   42 +-
 include/RAJA/index/IndexValue.hpp             |   59 +-
 include/RAJA/index/ListSegment.hpp            |   58 +-
 include/RAJA/index/RangeSegment.hpp           |   87 +-
 include/RAJA/internal/DepGraphNode.hpp        |    3 +-
 include/RAJA/internal/Iterators.hpp           |   12 +-
 include/RAJA/internal/MemUtils_CPU.hpp        |   20 +-
 include/RAJA/internal/RAJAVec.hpp             |  114 +-
 include/RAJA/internal/fault_tolerance.hpp     |   26 +-
 include/RAJA/internal/foldl.hpp               |   20 +-
 include/RAJA/internal/get_platform.hpp        |   28 +-
 include/RAJA/pattern/WorkGroup.hpp            |  216 +-
 include/RAJA/pattern/WorkGroup/Dispatcher.hpp |  424 +--
 include/RAJA/pattern/WorkGroup/WorkRunner.hpp |  195 +-
 .../RAJA/pattern/WorkGroup/WorkStorage.hpp    |  466 ++-
 include/RAJA/pattern/WorkGroup/WorkStruct.hpp |   62 +-
 include/RAJA/pattern/atomic.hpp               |   31 +-
 include/RAJA/pattern/detail/algorithm.hpp     |   27 +-
 include/RAJA/pattern/detail/multi_reduce.hpp  |  218 +-
 include/RAJA/pattern/detail/reduce.hpp        |  108 +-
 include/RAJA/pattern/forall.hpp               |  437 +--
 include/RAJA/pattern/kernel.hpp               |   85 +-
 include/RAJA/pattern/kernel/Conditional.hpp   |    7 +-
 include/RAJA/pattern/kernel/For.hpp           |   25 +-
 include/RAJA/pattern/kernel/ForICount.hpp     |   27 +-
 include/RAJA/pattern/kernel/Hyperplane.hpp    |   27 +-
 include/RAJA/pattern/kernel/InitLocalMem.hpp  |   55 +-
 include/RAJA/pattern/kernel/Lambda.hpp        |  210 +-
 include/RAJA/pattern/kernel/Param.hpp         |    9 +-
 include/RAJA/pattern/kernel/Reduce.hpp        |    6 +-
 include/RAJA/pattern/kernel/Region.hpp        |   30 +-
 include/RAJA/pattern/kernel/Tile.hpp          |   63 +-
 include/RAJA/pattern/kernel/TileTCount.hpp    |   36 +-
 .../RAJA/pattern/kernel/internal/LoopData.hpp |   96 +-
 .../pattern/kernel/internal/LoopTypes.hpp     |   72 +-
 .../pattern/kernel/internal/Statement.hpp     |   15 +-
 .../pattern/kernel/internal/StatementList.hpp |   16 +-
 .../RAJA/pattern/kernel/internal/Template.hpp |   22 +-
 include/RAJA/pattern/launch/launch_core.hpp   |  415 +--
 include/RAJA/pattern/multi_reduce.hpp         |    8 +-
 include/RAJA/pattern/params/forall.hpp        |  756 ++---
 include/RAJA/pattern/params/kernel_name.hpp   |   24 +-
 include/RAJA/pattern/params/params_base.hpp   |  358 ++-
 include/RAJA/pattern/params/reducer.hpp       |  302 +-
 include/RAJA/pattern/reduce.hpp               |    5 +-
 include/RAJA/pattern/scan.hpp                 |  227 +-
 include/RAJA/pattern/sort.hpp                 |  198 +-
 include/RAJA/pattern/tensor.hpp               |   14 +-
 .../RAJA/pattern/tensor/MatrixRegister.hpp    |   42 +-
 .../RAJA/pattern/tensor/ScalarRegister.hpp    |   14 +-
 include/RAJA/pattern/tensor/TensorBlock.hpp   |    7 +-
 include/RAJA/pattern/tensor/TensorIndex.hpp   |  366 +--
 include/RAJA/pattern/tensor/TensorLayout.hpp  |   81 +-
 .../RAJA/pattern/tensor/TensorRegister.hpp    |  139 +-
 .../RAJA/pattern/tensor/VectorRegister.hpp    |   19 +-
 .../tensor/internal/ET/BinaryOperator.hpp     |  240 +-
 .../internal/ET/BinaryOperatorTraits.hpp      |  235 +-
 .../tensor/internal/ET/BlockLiteral.hpp       |  149 +-
 .../internal/ET/ExpressionTemplateBase.hpp    |  245 +-
 .../tensor/internal/ET/MultiplyOperator.hpp   | 2218 +++++++-------
 .../tensor/internal/ET/TensorDivide.hpp       |  699 ++---
 .../tensor/internal/ET/TensorLiteral.hpp      |  133 +-
 .../tensor/internal/ET/TensorLoadStore.hpp    |  365 ++-
 .../tensor/internal/ET/TensorMultiply.hpp     |  254 +-
 .../tensor/internal/ET/TensorMultiplyAdd.hpp  |  154 +-
 .../tensor/internal/ET/TensorNegate.hpp       |  107 +-
 .../internal/ET/TensorScalarLiteral.hpp       |  135 +-
 .../tensor/internal/ET/TensorTranspose.hpp    |  118 +-
 .../tensor/internal/ET/normalizeOperand.hpp   |   88 +-
 .../tensor/internal/ExpressionTemplate.hpp    |    3 +-
 .../tensor/internal/MatrixMatrixMultiply.hpp  |  527 ++--
 .../tensor/internal/MatrixRegisterImpl.hpp    | 2608 +++++++++--------
 .../pattern/tensor/internal/RegisterBase.hpp  | 2032 +++++++------
 .../tensor/internal/TensorIndexTraits.hpp     |  588 ++--
 .../pattern/tensor/internal/TensorRef.hpp     | 1322 +++++----
 .../tensor/internal/TensorRegisterBase.hpp    | 1532 +++++-----
 .../tensor/internal/TensorTileExec.hpp        |  588 ++--
 .../tensor/internal/VectorRegisterImpl.hpp    | 1725 +++++------
 include/RAJA/pattern/tensor/stats.hpp         |   10 +-
 include/RAJA/policy/MultiPolicy.hpp           |   32 +-
 include/RAJA/policy/PolicyBase.hpp            |   34 +-
 include/RAJA/policy/WorkGroup.hpp             |   54 +-
 include/RAJA/policy/atomic_auto.hpp           |   15 +-
 include/RAJA/policy/atomic_builtin.hpp        |  187 +-
 include/RAJA/policy/cuda.hpp                  |   10 +-
 include/RAJA/policy/cuda/MemUtils_CUDA.hpp    |  189 +-
 .../RAJA/policy/cuda/WorkGroup/Dispatcher.hpp |   49 +-
 .../RAJA/policy/cuda/WorkGroup/WorkRunner.hpp |  270 +-
 include/RAJA/policy/cuda/atomic.hpp           |  363 +--
 include/RAJA/policy/cuda/forall.hpp           |  698 +++--
 include/RAJA/policy/cuda/intrinsics.hpp       |   99 +-
 .../RAJA/policy/cuda/kernel/Conditional.hpp   |   19 +-
 .../RAJA/policy/cuda/kernel/CudaKernel.hpp    |  216 +-
 include/RAJA/policy/cuda/kernel/For.hpp       |  203 +-
 include/RAJA/policy/cuda/kernel/ForICount.hpp |  330 ++-
 .../RAJA/policy/cuda/kernel/Hyperplane.hpp    |   41 +-
 .../RAJA/policy/cuda/kernel/InitLocalMem.hpp  |  172 +-
 include/RAJA/policy/cuda/kernel/Lambda.hpp    |   32 +-
 include/RAJA/policy/cuda/kernel/Reduce.hpp    |   15 +-
 include/RAJA/policy/cuda/kernel/Sync.hpp      |   34 +-
 include/RAJA/policy/cuda/kernel/Tile.hpp      |  161 +-
 .../RAJA/policy/cuda/kernel/TileTCount.hpp    |  229 +-
 include/RAJA/policy/cuda/kernel/internal.hpp  |  781 ++---
 include/RAJA/policy/cuda/launch.hpp           | 1280 ++++----
 include/RAJA/policy/cuda/multi_reduce.hpp     |  461 +--
 .../RAJA/policy/cuda/params/kernel_name.hpp   |   70 +-
 include/RAJA/policy/cuda/params/reduce.hpp    |  105 +-
 include/RAJA/policy/cuda/policy.hpp           | 1775 ++++++-----
 include/RAJA/policy/cuda/raja_cudaerrchk.hpp  |   13 +-
 include/RAJA/policy/cuda/reduce.hpp           |  349 ++-
 include/RAJA/policy/cuda/scan.hpp             |   63 +-
 include/RAJA/policy/cuda/sort.hpp             |  534 ++--
 include/RAJA/policy/desul/atomic.hpp          |   56 +-
 include/RAJA/policy/hip.hpp                   |    8 +-
 include/RAJA/policy/hip/MemUtils_HIP.hpp      |  214 +-
 .../RAJA/policy/hip/WorkGroup/Dispatcher.hpp  |   42 +-
 .../RAJA/policy/hip/WorkGroup/WorkRunner.hpp  |  302 +-
 include/RAJA/policy/hip/atomic.hpp            |  367 ++-
 include/RAJA/policy/hip/forall.hpp            |  683 +++--
 include/RAJA/policy/hip/intrinsics.hpp        |   86 +-
 .../RAJA/policy/hip/kernel/Conditional.hpp    |   23 +-
 include/RAJA/policy/hip/kernel/For.hpp        |  208 +-
 include/RAJA/policy/hip/kernel/ForICount.hpp  |  332 ++-
 include/RAJA/policy/hip/kernel/HipKernel.hpp  |  194 +-
 include/RAJA/policy/hip/kernel/Hyperplane.hpp |   41 +-
 .../RAJA/policy/hip/kernel/InitLocalMem.hpp   |  174 +-
 include/RAJA/policy/hip/kernel/Lambda.hpp     |   32 +-
 include/RAJA/policy/hip/kernel/Reduce.hpp     |   36 +-
 include/RAJA/policy/hip/kernel/Sync.hpp       |   30 +-
 include/RAJA/policy/hip/kernel/Tile.hpp       |  161 +-
 include/RAJA/policy/hip/kernel/TileTCount.hpp |  223 +-
 include/RAJA/policy/hip/kernel/internal.hpp   |  782 ++---
 include/RAJA/policy/hip/launch.hpp            | 1252 ++++----
 include/RAJA/policy/hip/multi_reduce.hpp      |  459 +--
 .../RAJA/policy/hip/params/kernel_name.hpp    |   69 +-
 include/RAJA/policy/hip/params/reduce.hpp     |  104 +-
 include/RAJA/policy/hip/policy.hpp            | 1562 +++++-----
 include/RAJA/policy/hip/raja_hiperrchk.hpp    |   16 +-
 include/RAJA/policy/hip/reduce.hpp            |  337 ++-
 include/RAJA/policy/hip/scan.hpp              |   78 +-
 include/RAJA/policy/hip/sort.hpp              |  443 +--
 include/RAJA/policy/openmp.hpp                |    9 +-
 .../policy/openmp/WorkGroup/Dispatcher.hpp    |    8 +-
 .../policy/openmp/WorkGroup/WorkRunner.hpp    |   68 +-
 include/RAJA/policy/openmp/atomic.hpp         |   58 +-
 include/RAJA/policy/openmp/forall.hpp         |  470 +--
 .../RAJA/policy/openmp/kernel/Collapse.hpp    |   25 +-
 .../policy/openmp/kernel/OmpSyncThreads.hpp   |   26 +-
 include/RAJA/policy/openmp/launch.hpp         |   89 +-
 include/RAJA/policy/openmp/multi_reduce.hpp   |  205 +-
 include/RAJA/policy/openmp/params/forall.hpp  |  579 ++--
 .../RAJA/policy/openmp/params/kernel_name.hpp |   60 +-
 include/RAJA/policy/openmp/params/reduce.hpp  |   61 +-
 include/RAJA/policy/openmp/policy.hpp         |  179 +-
 include/RAJA/policy/openmp/reduce.hpp         |    8 +-
 include/RAJA/policy/openmp/region.hpp         |   10 +-
 include/RAJA/policy/openmp/scan.hpp           |  110 +-
 include/RAJA/policy/openmp/sort.hpp           |  155 +-
 include/RAJA/policy/openmp_target.hpp         |    7 +-
 .../openmp_target/WorkGroup/Dispatcher.hpp    |   26 +-
 .../openmp_target/WorkGroup/WorkRunner.hpp    |   68 +-
 include/RAJA/policy/openmp_target/forall.hpp  |   95 +-
 .../policy/openmp_target/kernel/Collapse.hpp  |   86 +-
 .../RAJA/policy/openmp_target/kernel/For.hpp  |   46 +-
 .../openmp_target/params/kernel_name.hpp      |   59 +-
 .../policy/openmp_target/params/reduce.hpp    |   60 +-
 include/RAJA/policy/openmp_target/policy.hpp  |   49 +-
 include/RAJA/policy/openmp_target/reduce.hpp  |   69 +-
 include/RAJA/policy/sequential.hpp            |    8 +-
 .../sequential/WorkGroup/Dispatcher.hpp       |   12 +-
 .../sequential/WorkGroup/WorkRunner.hpp       |   66 +-
 include/RAJA/policy/sequential/atomic.hpp     |   46 +-
 include/RAJA/policy/sequential/forall.hpp     |   42 +-
 .../policy/sequential/kernel/Collapse.hpp     |   17 +-
 .../RAJA/policy/sequential/kernel/Reduce.hpp  |    3 +-
 include/RAJA/policy/sequential/launch.hpp     |   62 +-
 .../RAJA/policy/sequential/multi_reduce.hpp   |   60 +-
 .../policy/sequential/params/kernel_name.hpp  |   69 +-
 .../RAJA/policy/sequential/params/reduce.hpp  |   57 +-
 include/RAJA/policy/sequential/policy.hpp     |   40 +-
 include/RAJA/policy/sequential/reduce.hpp     |    4 -
 include/RAJA/policy/sequential/scan.hpp       |   81 +-
 include/RAJA/policy/sequential/sort.hpp       |   96 +-
 include/RAJA/policy/simd.hpp                  |    4 +-
 include/RAJA/policy/simd/forall.hpp           |   30 +-
 include/RAJA/policy/simd/kernel/For.hpp       |   15 +-
 include/RAJA/policy/simd/kernel/ForICount.hpp |   25 +-
 include/RAJA/policy/sycl.hpp                  |    3 +-
 include/RAJA/policy/sycl/MemUtils_SYCL.hpp    |    7 +-
 include/RAJA/policy/sycl/forall.hpp           |  253 +-
 include/RAJA/policy/sycl/kernel.hpp           |    2 +-
 .../RAJA/policy/sycl/kernel/Conditional.hpp   |   21 +-
 include/RAJA/policy/sycl/kernel/For.hpp       |  139 +-
 include/RAJA/policy/sycl/kernel/ForICount.hpp |  244 +-
 include/RAJA/policy/sycl/kernel/Lambda.hpp    |   35 +-
 .../RAJA/policy/sycl/kernel/SyclKernel.hpp    |  106 +-
 include/RAJA/policy/sycl/kernel/Tile.hpp      |  158 +-
 .../RAJA/policy/sycl/kernel/TileTCount.hpp    |  227 +-
 include/RAJA/policy/sycl/kernel/internal.hpp  |  110 +-
 include/RAJA/policy/sycl/launch.hpp           |  992 +++----
 .../RAJA/policy/sycl/params/kernel_name.hpp   |   76 +-
 include/RAJA/policy/sycl/params/reduce.hpp    |   61 +-
 include/RAJA/policy/sycl/policy.hpp           |  133 +-
 include/RAJA/policy/sycl/reduce.hpp           |  197 +-
 include/RAJA/policy/tensor/arch.hpp           |   58 +-
 include/RAJA/policy/tensor/arch/avx.hpp       |   12 +-
 .../policy/tensor/arch/avx/avx_double.hpp     |  879 +++---
 .../RAJA/policy/tensor/arch/avx/avx_float.hpp |  912 +++---
 .../RAJA/policy/tensor/arch/avx/avx_int32.hpp | 1471 +++++-----
 .../RAJA/policy/tensor/arch/avx/avx_int64.hpp | 1012 +++----
 .../RAJA/policy/tensor/arch/avx/traits.hpp    |   81 +-
 include/RAJA/policy/tensor/arch/avx2.hpp      |   12 +-
 .../policy/tensor/arch/avx2/avx2_double.hpp   | 1003 +++----
 .../policy/tensor/arch/avx2/avx2_float.hpp    |  964 +++---
 .../policy/tensor/arch/avx2/avx2_int32.hpp    | 1100 +++----
 .../policy/tensor/arch/avx2/avx2_int64.hpp    | 1034 +++----
 .../RAJA/policy/tensor/arch/avx2/traits.hpp   |  105 +-
 include/RAJA/policy/tensor/arch/avx512.hpp    |   12 +-
 .../tensor/arch/avx512/avx512_double.hpp      |  713 ++---
 .../tensor/arch/avx512/avx512_float.hpp       |  736 ++---
 .../tensor/arch/avx512/avx512_int32.hpp       |  866 +++---
 .../tensor/arch/avx512/avx512_int64.hpp       |  759 ++---
 .../RAJA/policy/tensor/arch/avx512/traits.hpp |   84 +-
 include/RAJA/policy/tensor/arch/cuda.hpp      |    6 +-
 .../policy/tensor/arch/cuda/cuda_warp.hpp     | 1945 ++++++------
 .../RAJA/policy/tensor/arch/cuda/traits.hpp   |   38 +-
 include/RAJA/policy/tensor/arch/hip.hpp       |    6 +-
 .../RAJA/policy/tensor/arch/hip/hip_wave.hpp  | 1945 ++++++------
 .../RAJA/policy/tensor/arch/hip/traits.hpp    |   35 +-
 include/RAJA/policy/tensor/arch/scalar.hpp    |    8 +-
 .../RAJA/policy/tensor/arch/scalar/scalar.hpp |  886 +++---
 .../RAJA/policy/tensor/arch/scalar/traits.hpp |   81 +-
 include/RAJA/policy/tensor/arch_impl.hpp      |   14 +-
 include/RAJA/policy/tensor/policy.hpp         |   30 +-
 include/RAJA/util/BitMask.hpp                 |   96 +-
 include/RAJA/util/CombiningAdapter.hpp        |   89 +-
 include/RAJA/util/EnableIf.hpp                |   14 +-
 include/RAJA/util/IndexLayout.hpp             |  111 +-
 include/RAJA/util/KokkosPluginLoader.hpp      |   53 +-
 include/RAJA/util/Layout.hpp                  |   87 +-
 include/RAJA/util/LocalArray.hpp              |   91 +-
 include/RAJA/util/OffsetLayout.hpp            |   98 +-
 include/RAJA/util/OffsetOperators.hpp         |   61 +-
 include/RAJA/util/Operators.hpp               |   59 +-
 include/RAJA/util/Permutations.hpp            |   51 +-
 include/RAJA/util/PermutedLayout.hpp          |    6 +-
 include/RAJA/util/PluginContext.hpp           |   27 +-
 include/RAJA/util/PluginLinker.hpp            |   23 +-
 include/RAJA/util/PluginOptions.hpp           |   21 +-
 include/RAJA/util/PluginStrategy.hpp          |   28 +-
 include/RAJA/util/Registry.hpp                |  236 +-
 include/RAJA/util/RepeatView.hpp              |  126 +-
 include/RAJA/util/RuntimePluginLoader.hpp     |   41 +-
 include/RAJA/util/SoAPtr.hpp                  |  100 +-
 include/RAJA/util/Span.hpp                    |   46 +-
 include/RAJA/util/StaticLayout.hpp            |  143 +-
 include/RAJA/util/Timer.hpp                   |    1 +
 include/RAJA/util/TypeConvert.hpp             |    5 +-
 include/RAJA/util/TypedViewBase.hpp           | 1277 ++++----
 include/RAJA/util/View.hpp                    |  185 +-
 include/RAJA/util/align.hpp                   |   10 +-
 include/RAJA/util/basic_mempool.hpp           |    8 +-
 include/RAJA/util/camp_aliases.hpp            |    3 +-
 include/RAJA/util/concepts.hpp                |    6 +-
 include/RAJA/util/for_each.hpp                |   40 +-
 include/RAJA/util/macros.hpp                  |   25 +-
 include/RAJA/util/math.hpp                    |   47 +-
 include/RAJA/util/plugins.hpp                 |   84 +-
 include/RAJA/util/reduce.hpp                  |  164 +-
 include/RAJA/util/resource.hpp                |  279 +-
 include/RAJA/util/sort.hpp                    |  396 ++-
 include/RAJA/util/types.hpp                   |  138 +-
 include/RAJA/util/zip.hpp                     |   91 +-
 include/RAJA/util/zip_tuple.hpp               |  438 +--
 scripts/lc-builds/toss4_clang-format.sh       |   43 +
 src/AlignedRangeIndexSetBuilders.cpp          |   13 +-
 src/DepGraphNode.cpp                          |    4 +-
 src/KokkosPluginLoader.cpp                    |  102 +-
 src/LockFreeIndexSetBuilders.cpp              |   21 +-
 src/MemUtils_CUDA.cpp                         |    1 -
 src/MemUtils_HIP.cpp                          |    1 -
 src/PluginStrategy.cpp                        |   22 +-
 src/RuntimePluginLoader.cpp                   |  101 +-
 src/TensorStats.cpp                           |   15 +-
 289 files changed, 35855 insertions(+), 32126 deletions(-)
 create mode 100755 scripts/lc-builds/toss4_clang-format.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3eb0dbc8d2..f31e3b1795 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+# Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
 # and RAJA project contributors. See the RAJA/LICENSE file for details.
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################
@@ -41,7 +41,7 @@ project(RAJA LANGUAGES CXX C
   VERSION ${RAJA_LOADED})
 
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PATH})
-
+set(BLT_REQUIRED_CLANGFORMAT_VERSION  "14" CACHE STRING "")
 include(cmake/SetupRajaOptions.cmake)
 
 cmake_minimum_required(VERSION 3.23)
@@ -136,6 +136,9 @@ include(cmake/SetupCompilers.cmake)
 # Macros for building executables and libraries
 include (cmake/RAJAMacros.cmake)
 
+# Configure `style` target for enforcing code style
+raja_add_code_checks()
+
 set (raja_sources
   src/AlignedRangeIndexSetBuilders.cpp
   src/DepGraphNode.cpp
diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake
index c412593db7..848f5779e4 100644
--- a/cmake/RAJAMacros.cmake
+++ b/cmake/RAJAMacros.cmake
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+# Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
 # and other RAJA project contributors. See the RAJA/LICENSE file for details.
 #
 # SPDX-License-Identifier: (BSD-3-Clause)
@@ -204,3 +204,62 @@ macro(raja_add_benchmark)
     NUM_OMP_THREADS ${arg_NUM_OMP_THREADS}
     COMMAND ${TEST_DRIVER} ${arg_NAME})
 endmacro(raja_add_benchmark)
+
+##------------------------------------------------------------------------------
+## raja_add_code_checks()
+##
+## Adds code checks for all source files recursively in the RAJA repository.
+##
+## This creates the following parent build targets:
+##  check - Runs a non file changing style check and CppCheck
+##  style - In-place code formatting
+##
+## Creates various child build targets that follow this pattern:
+##  RAJA_<check|style>
+##  RAJA_<cppcheck|clangformat>_<check|style>
+##------------------------------------------------------------------------------
+macro(raja_add_code_checks)
+
+  set(options)
+  set(singleValueArgs)
+  set(multiValueArgs)
+
+  # No keywords are declared above, so this defines no arg_<KEYWORD> variables
+  cmake_parse_arguments(arg
+       "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  # Only do code checks if building raja by itself and not included in
+  # another project
+  if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}")
+      # Create file globbing expressions that only include directories that contain source
+      # TODO(bowen) Add examples, exercises, test, and benchmark to the list below
+      set(_base_dirs "RAJA" "include" "src")
+      set(_ext_expressions "*.cpp" "*.hpp" "*.inl"
+                           "*.cxx" "*.hxx" "*.cc" "*.c" "*.h" "*.hh")
+
+      set(_glob_expressions)
+      foreach(_exp ${_ext_expressions})
+          foreach(_base_dir ${_base_dirs})
+              list(APPEND _glob_expressions "${PROJECT_SOURCE_DIR}/${_base_dir}/${_exp}")
+          endforeach()
+      endforeach()
+
+      # Glob for list of files to run code checks on
+      set(_sources)
+      file(GLOB_RECURSE _sources ${_glob_expressions})
+
+      blt_add_code_checks(PREFIX          RAJA
+                          SOURCES         ${_sources}
+                          CLANGFORMAT_CFG_FILE ${PROJECT_SOURCE_DIR}/.clang-format
+                          CPPCHECK_FLAGS  --enable=all --inconclusive)
+
+      # Set FOLDER on the RAJA_<suffix> targets (same PREFIX as passed to BLT above)
+      foreach(_suffix clangformat_check clangformat_style clang_tidy_check clang_tidy_style)
+          set(_tgt RAJA_${_suffix})
+          if(TARGET ${_tgt})
+              set_target_properties(${_tgt} PROPERTIES FOLDER "RAJA/code_checks")
+          endif()
+      endforeach()
+  endif()
+
+endmacro(raja_add_code_checks)
diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp
index 59cca4bf22..3fe99418a8 100644
--- a/include/RAJA/RAJA.hpp
+++ b/include/RAJA/RAJA.hpp
@@ -27,16 +27,15 @@
 #define RAJA_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/util/Operators.hpp"
+#include "RAJA/util/Registry.hpp"
 #include "RAJA/util/basic_mempool.hpp"
 #include "RAJA/util/camp_aliases.hpp"
+#include "RAJA/util/for_each.hpp"
 #include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
 #include "RAJA/util/math.hpp"
 #include "RAJA/util/plugins.hpp"
-#include "RAJA/util/Registry.hpp"
-#include "RAJA/util/for_each.hpp"
+#include "RAJA/util/types.hpp"
 
 
 //
@@ -88,7 +87,7 @@
 #endif
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/desul.hpp"
+#include "RAJA/policy/desul.hpp"
 #endif
 
 #include "RAJA/index/IndexSet.hpp"
@@ -105,18 +104,17 @@
 //
 #include "RAJA/pattern/forall.hpp"
 #include "RAJA/pattern/region.hpp"
-
 #include "RAJA/policy/MultiPolicy.hpp"
 
 
 //
 // Multidimensional layouts and views
 //
+#include "RAJA/util/IndexLayout.hpp"
 #include "RAJA/util/Layout.hpp"
 #include "RAJA/util/OffsetLayout.hpp"
 #include "RAJA/util/PermutedLayout.hpp"
 #include "RAJA/util/StaticLayout.hpp"
-#include "RAJA/util/IndexLayout.hpp"
 #include "RAJA/util/View.hpp"
 
 
@@ -158,14 +156,14 @@
 //
 // WorkPool, WorkGroup, WorkSite objects
 //
-#include "RAJA/policy/WorkGroup.hpp"
 #include "RAJA/pattern/WorkGroup.hpp"
+#include "RAJA/policy/WorkGroup.hpp"
 
 //
 // Reduction objects
 //
-#include "RAJA/pattern/reduce.hpp"
 #include "RAJA/pattern/multi_reduce.hpp"
+#include "RAJA/pattern/reduce.hpp"
 
 
 //
@@ -186,9 +184,8 @@
 //////////////////////////////////////////////////////////////////////
 //
 
-#include "RAJA/index/IndexSetUtils.hpp"
 #include "RAJA/index/IndexSetBuilders.hpp"
-
+#include "RAJA/index/IndexSetUtils.hpp"
 #include "RAJA/pattern/scan.hpp"
 
 #if defined(RAJA_ENABLE_RUNTIME_PLUGINS)
@@ -197,11 +194,14 @@
 
 #include "RAJA/pattern/sort.hpp"
 
-namespace RAJA {
-namespace expt{}
+namespace RAJA
+{
+namespace expt
+{
+}
 //  // provide a RAJA::expt namespace for experimental work, but bring alias
 //  // it into RAJA so it doesn't affect user code
 //  using namespace expt;
-}
+}  // namespace RAJA
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp
index 1a467c8341..2e12bdb707 100644
--- a/include/RAJA/index/IndexSet.hpp
+++ b/include/RAJA/index/IndexSet.hpp
@@ -19,15 +19,11 @@
 #define RAJA_IndexSet_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/index/ListSegment.hpp"
 #include "RAJA/index/RangeSegment.hpp"
-
 #include "RAJA/internal/Iterators.hpp"
 #include "RAJA/internal/RAJAVec.hpp"
-
 #include "RAJA/policy/PolicyBase.hpp"
-
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/concepts.hpp"
 
@@ -91,7 +87,7 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
   //! Construct empty index set
 #if _MSC_VER < 1910
-   // this one instance of constexpr does not work on VS2012 or VS2015
+  // this one instance of constexpr does not work on VS2012 or VS2015
   RAJA_INLINE TypedIndexSet() : PARENT() {}
 #else
   RAJA_INLINE constexpr TypedIndexSet() : PARENT() {}
@@ -240,11 +236,11 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
     if (pend == PUSH_BACK) {
       for (Index_type i = 0; i < num; ++i) {
         segment_push_into(i, c, pend, pcopy);
-      } 
+      }
     } else {
-      for (Index_type i = num-1; i > -1; --i) {
+      for (Index_type i = num - 1; i > -1; --i) {
         segment_push_into(i, c, pend, pcopy);
-      } 
+      }
     }
   }
 
@@ -301,14 +297,18 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   template <typename Tnew>
   RAJA_INLINE void push_back(Tnew &&val)
   {
-    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)), PUSH_BACK, PUSH_COPY);
+    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)),
+                  PUSH_BACK,
+                  PUSH_COPY);
   }
 
   //! Add copy of segment to front end of index set.
   template <typename Tnew>
   RAJA_INLINE void push_front(Tnew &&val)
   {
-    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)), PUSH_FRONT, PUSH_COPY);
+    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)),
+                  PUSH_FRONT,
+                  PUSH_COPY);
   }
 
   //! Return total length -- sum of lengths of all segments
@@ -341,7 +341,7 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   template <typename BODY, typename... ARGS>
   RAJA_HOST_DEVICE void segmentCall(size_t segid,
                                     BODY &&body,
-                                    ARGS &&... args) const
+                                    ARGS &&...args) const
   {
     if (getSegmentTypes()[segid] != T0_TypeId) {
       PARENT::segmentCall(segid,
@@ -762,12 +762,14 @@ namespace type_traits
 
 template <typename T>
 struct is_index_set
-    : ::RAJA::type_traits::SpecializationOf<RAJA::TypedIndexSet, typename std::decay<T>::type> {
+    : ::RAJA::type_traits::SpecializationOf<RAJA::TypedIndexSet,
+                                            typename std::decay<T>::type> {
 };
 
 template <typename T>
 struct is_indexset_policy
-    : ::RAJA::type_traits::SpecializationOf<RAJA::ExecPolicy, typename std::decay<T>::type> {
+    : ::RAJA::type_traits::SpecializationOf<RAJA::ExecPolicy,
+                                            typename std::decay<T>::type> {
 };
 }  // namespace type_traits
 
diff --git a/include/RAJA/index/IndexSetBuilders.hpp b/include/RAJA/index/IndexSetBuilders.hpp
index 543524be01..6db86ce3bb 100644
--- a/include/RAJA/index/IndexSetBuilders.hpp
+++ b/include/RAJA/index/IndexSetBuilders.hpp
@@ -19,13 +19,10 @@
 #define RAJA_IndexSetBuilders_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/index/IndexSet.hpp"
 #include "RAJA/index/ListSegment.hpp"
 #include "RAJA/index/RangeSegment.hpp"
-
 #include "RAJA/util/types.hpp"
-
 #include "camp/resource.hpp"
 
 namespace RAJA
@@ -37,13 +34,13 @@ namespace RAJA
  * \brief Generate an index set with aligned Range segments and List segments,
  *        as needed, from given array of indices.
  *
- *        Routine does no error-checking on argements and assumes 
+ *        Routine does no error-checking on arguments and assumes
  *        RAJA::Index_type array contains valid indices.
  *
- *  \param iset reference to index set generated with aligned range segments 
+ *  \param iset reference to index set generated with aligned range segments
  *         and list segments. Method assumes index set is empty (no segments).
- *  \param work_res camp resource object that identifies the memory space in 
- *         which list segment index data will live (passed to list segment 
+ *  \param work_res camp resource object that identifies the memory space in
+ *         which list segment index data will live (passed to list segment
  *         ctor).
  *  \param indices_in pointer to start of input array of indices.
  *  \param length size of input index array.
@@ -79,37 +76,36 @@ void RAJASHAREDDLL_API buildIndexSetAligned(
  ******************************************************************************
  *
  * \brief Generate a lock-free "block" index set (planar division) containing
- *        range segments. 
+ *        range segments.
  *
- *        The method chunks a fastDim x midDim x slowDim mesh into blocks that 
+ *        The method chunks a fastDim x midDim x slowDim mesh into blocks that
  *        can be dependency-scheduled, removing need for lock constructs.
  *
  *  \param iset reference to index set generated with range segments.
- *         Method assumes index set is empty (no segments). 
+ *         Method assumes index set is empty (no segments).
  *  \param fastDim "fast" block dimension (see above).
  *  \param midDim  "mid" block dimension (see above).
  *  \param slowDim "slow" block dimension (see above).
  *
  ******************************************************************************
  */
-void buildLockFreeBlockIndexset(
-    RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
-    int fastDim,
-    int midDim,
-    int slowDim);
+void buildLockFreeBlockIndexset(RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
+                                int fastDim,
+                                int midDim,
+                                int slowDim);
 
 /*!
  ******************************************************************************
  *
  * \brief Generate a lock-free "color" index set containing range and list
  *        segments.
- * 
- *        TThe domain-set is colored based on connectivity to the range-set. 
- *        All elements in each segment are independent, and no two segments 
+ *
+ *        The domain-set is colored based on connectivity to the range-set.
+ *        All elements in each segment are independent, and no two segments
  *        can be executed in parallel.
  *
- * \param iset reference to index set generated. Method assumes index set 
- *        is empty (no segments). 
+ * \param iset reference to index set generated. Method assumes index set
+ *        is empty (no segments).
  * \param work_res camp resource object that identifies the memory space in
  *         which list segment index data will live (passed to list segment
  *         ctor).
diff --git a/include/RAJA/index/IndexSetUtils.hpp b/include/RAJA/index/IndexSetUtils.hpp
index 4baea450fc..739ab818fb 100644
--- a/include/RAJA/index/IndexSetUtils.hpp
+++ b/include/RAJA/index/IndexSetUtils.hpp
@@ -20,9 +20,7 @@
 #define RAJA_IndexSetUtils_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/pattern/forall.hpp"
-
 #include "RAJA/policy/sequential.hpp"
 
 namespace RAJA
@@ -31,10 +29,10 @@ namespace RAJA
 //@{
 //!   @name Methods to gather indices of segment or index set into a container.
 //!
-//!   For each method, the given container must be templated on a data type, 
-//!   have default and copy ctors, push_back method, and value_type. Is is 
-//!   assumed that the container data type and segment or index set data type 
-//!   are compatible in the sense that the index set type can be converted to 
+//!   For each method, the given container must be templated on a data type,
+//!   have default and copy ctors, push_back method, and value_type. It is
+//!   assumed that the container data type and segment or index set data type
+//!   are compatible in the sense that the index set type can be converted to
 //!   the container data type.
 
 /*!
@@ -49,11 +47,8 @@ RAJA_INLINE void getIndices(CONTAINER_T& con,
                             const TypedIndexSet<SEG_TYPES...>& iset)
 {
   CONTAINER_T tcon;
-  forall<ExecPolicy<seq_segit, seq_exec> >(iset,
-    [&](typename CONTAINER_T::value_type idx) {
-      tcon.push_back(idx);
-    }
-  );
+  forall<ExecPolicy<seq_segit, seq_exec> >(
+      iset, [&](typename CONTAINER_T::value_type idx) { tcon.push_back(idx); });
   con = tcon;
 }
 
@@ -68,11 +63,9 @@ template <typename CONTAINER_T, typename SEGMENT_T>
 RAJA_INLINE void getIndices(CONTAINER_T& con, const SEGMENT_T& seg)
 {
   CONTAINER_T tcon;
-  forall<seq_exec>(seg,
-    [&](typename CONTAINER_T::value_type idx) {
-      tcon.push_back(idx);
-    }
-  );
+  forall<seq_exec>(seg, [&](typename CONTAINER_T::value_type idx) {
+    tcon.push_back(idx);
+  });
   con = tcon;
 }
 
@@ -90,11 +83,10 @@ RAJA_INLINE void getIndicesConditional(CONTAINER_T& con,
                                        CONDITIONAL conditional)
 {
   CONTAINER_T tcon;
-  forall<ExecPolicy<seq_segit, seq_exec> >(iset,
-    [&](typename CONTAINER_T::value_type idx) {
-      if (conditional(idx)) tcon.push_back(idx);
-    }
-  );
+  forall<ExecPolicy<seq_segit, seq_exec> >(
+      iset, [&](typename CONTAINER_T::value_type idx) {
+        if (conditional(idx)) tcon.push_back(idx);
+      });
   con = tcon;
 }
 
@@ -112,11 +104,9 @@ RAJA_INLINE void getIndicesConditional(CONTAINER_T& con,
                                        CONDITIONAL conditional)
 {
   CONTAINER_T tcon;
-  forall<seq_exec>(seg,
-    [&](typename CONTAINER_T::value_type idx) {
-      if (conditional(idx)) tcon.push_back(idx);
-    }
-  );
+  forall<seq_exec>(seg, [&](typename CONTAINER_T::value_type idx) {
+    if (conditional(idx)) tcon.push_back(idx);
+  });
   con = tcon;
 }
 
diff --git a/include/RAJA/index/IndexValue.hpp b/include/RAJA/index/IndexValue.hpp
index 44fa143445..98fffb104e 100644
--- a/include/RAJA/index/IndexValue.hpp
+++ b/include/RAJA/index/IndexValue.hpp
@@ -18,10 +18,9 @@
 #ifndef RAJA_INDEXVALUE_HPP
 #define RAJA_INDEXVALUE_HPP
 
-#include "RAJA/config.hpp"
-
 #include <string>
 
+#include "RAJA/config.hpp"
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
 
@@ -334,18 +333,21 @@ constexpr RAJA_HOST_DEVICE RAJA_INLINE
   return val;
 }
 
-namespace internal{
-template<typename FROM, typename Enable = void>
+namespace internal
+{
+template <typename FROM, typename Enable = void>
 struct StripIndexTypeT {
-    using type = FROM;
+  using type = FROM;
 };
 
-template<typename FROM>
-struct StripIndexTypeT<FROM, typename std::enable_if<std::is_base_of<IndexValueBase, FROM>::value>::type>
-{
-    using type = typename FROM::value_type;
+template <typename FROM>
+struct StripIndexTypeT<
+    FROM,
+    typename std::enable_if<
+        std::is_base_of<IndexValueBase, FROM>::value>::type> {
+  using type = typename FROM::value_type;
 };
-} // namespace internal
+}  // namespace internal
 
 /*!
  * \brief Strips a strongly typed index to its underlying type
@@ -353,7 +355,7 @@ struct StripIndexTypeT<FROM, typename std::enable_if<std::is_base_of<IndexValueB
  *
  * \param FROM the original type
  */
-template<typename FROM>
+template <typename FROM>
 using strip_index_type_t = typename internal::StripIndexTypeT<FROM>::type;
 
 /*!
@@ -362,12 +364,11 @@ using strip_index_type_t = typename internal::StripIndexTypeT<FROM>::type;
  *
  * \param FROM the original type
  */
-template<typename FROM>
-using make_signed_t = typename std::conditional < 
-                                  std::is_floating_point<FROM>::value,
-                                    std::common_type<FROM>,
-                                    std::make_signed<FROM>
-                               >::type::type;
+template <typename FROM>
+using make_signed_t =
+    typename std::conditional<std::is_floating_point<FROM>::value,
+                              std::common_type<FROM>,
+                              std::make_signed<FROM> >::type::type;
 
 }  // namespace RAJA
 
@@ -397,17 +398,19 @@ using make_signed_t = typename std::conditional <
  * \param IDXT the index types value type
  * \param NAME a string literal to identify this index type
  */
-#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME)                         \
-  class TYPE : public ::RAJA::IndexValue<TYPE, IDXT>                 \
-  {                                                                  \
-  public:                                                            \
-    RAJA_HOST_DEVICE RAJA_INLINE TYPE()                              \
-        : RAJA::IndexValue<TYPE,IDXT>::IndexValue() {}               \
-    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v)               \
-        : RAJA::IndexValue<TYPE,IDXT>::IndexValue(v)                 \
-    {                                                                \
-    }                                                                \
-    static inline std::string getName() { return NAME; }             \
+#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME)             \
+  class TYPE : public ::RAJA::IndexValue<TYPE, IDXT>     \
+  {                                                      \
+  public:                                                \
+    RAJA_HOST_DEVICE RAJA_INLINE TYPE()                  \
+        : RAJA::IndexValue<TYPE, IDXT>::IndexValue()     \
+    {                                                    \
+    }                                                    \
+    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v)   \
+        : RAJA::IndexValue<TYPE, IDXT>::IndexValue(v)    \
+    {                                                    \
+    }                                                    \
+    static inline std::string getName() { return NAME; } \
   };
 
 #endif
diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp
index adee46053c..4eacc1a55f 100644
--- a/include/RAJA/index/ListSegment.hpp
+++ b/include/RAJA/index/ListSegment.hpp
@@ -18,18 +18,16 @@
 #ifndef RAJA_ListSegment_HPP
 #define RAJA_ListSegment_HPP
 
-#include "RAJA/config.hpp"
-
 #include <memory>
 #include <type_traits>
 #include <utility>
 
-#include "camp/resource.hpp"
-
+#include "RAJA/config.hpp"
+#include "RAJA/util/Span.hpp"
 #include "RAJA/util/concepts.hpp"
 #include "RAJA/util/macros.hpp"
-#include "RAJA/util/Span.hpp"
 #include "RAJA/util/types.hpp"
+#include "camp/resource.hpp"
 
 namespace RAJA
 {
@@ -85,7 +83,6 @@ template <typename StorageT>
 class TypedListSegment
 {
 public:
-
   //@{
   //!   @name Types used in implementation based on template parameter.
 
@@ -111,7 +108,7 @@ class TypedListSegment
    * \param values array of indices defining iteration space of segment
    * \param length number of indices
    * \param resource camp resource defining memory space where index data live
-   * \param owned optional enum value indicating whether segment owns indices 
+   * \param owned optional enum value indicating whether segment owns indices
    * (Owned or Unowned). Default is Owned.
    *
    * If 'Unowned' is passed as last argument, the segment will not own its
@@ -121,7 +118,7 @@ class TypedListSegment
                    Index_type length,
                    camp::resources::Resource resource,
                    IndexOwnership owned = Owned)
-    : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0)
+      : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0)
   {
     initIndexData(values, length, resource, owned);
   }
@@ -141,7 +138,10 @@ class TypedListSegment
   template <typename Container>
   TypedListSegment(const Container& container,
                    camp::resources::Resource resource)
-    : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(container.size())
+      : m_resource(nullptr),
+        m_owned(Unowned),
+        m_data(nullptr),
+        m_size(container.size())
   {
     if (m_size > 0) {
 
@@ -164,7 +164,6 @@ class TypedListSegment
       m_owned = Owned;
 
       host_res.deallocate(tmp);
-
     }
   }
 
@@ -175,8 +174,10 @@ class TypedListSegment
   //  As this may be called from a lambda in a
   //  RAJA method we perform a shallow copy
   RAJA_HOST_DEVICE TypedListSegment(const TypedListSegment& other)
-    : m_resource(nullptr),
-      m_owned(Unowned), m_data(other.m_data), m_size(other.m_size)
+      : m_resource(nullptr),
+        m_owned(Unowned),
+        m_data(other.m_data),
+        m_size(other.m_size)
   {
   }
 
@@ -192,7 +193,7 @@ class TypedListSegment
     m_size = other.m_size;
   }
 
-    //! move assignment for list segment
+  //! move assignment for list segment
   //  As this may be called from a lambda in a
   //  RAJA method we perform a shallow copy
   RAJA_HOST_DEVICE TypedListSegment& operator=(TypedListSegment&& rhs)
@@ -211,8 +212,10 @@ class TypedListSegment
 
   //! Move constructor for list segment
   RAJA_HOST_DEVICE TypedListSegment(TypedListSegment&& rhs)
-    : m_resource(rhs.m_resource),
-      m_owned(rhs.m_owned), m_data(rhs.m_data), m_size(rhs.m_size)
+      : m_resource(rhs.m_resource),
+        m_owned(rhs.m_owned),
+        m_data(rhs.m_data),
+        m_size(rhs.m_size)
   {
     rhs.m_owned = Unowned;
     rhs.m_resource = nullptr;
@@ -221,10 +224,7 @@ class TypedListSegment
   }
 
   //! List segment destructor
-  RAJA_HOST_DEVICE ~TypedListSegment()
-  {
-    clear();
-  }
+  RAJA_HOST_DEVICE ~TypedListSegment() { clear(); }
 
   //! Clear method to be called
   RAJA_HOST_DEVICE void clear()
@@ -357,20 +357,20 @@ class TypedListSegment
     m_owned = container_own;
     if (m_owned == Owned) {
 
-        m_resource = new camp::resources::Resource(resource_);
+      m_resource = new camp::resources::Resource(resource_);
 
-        camp::resources::Resource host_res{camp::resources::Host()};
+      camp::resources::Resource host_res{camp::resources::Host()};
 
-        value_type* tmp = host_res.allocate<value_type>(m_size);
+      value_type* tmp = host_res.allocate<value_type>(m_size);
 
-        for (Index_type i = 0; i < m_size; ++i) {
-          tmp[i] = container[i];
-        }
+      for (Index_type i = 0; i < m_size; ++i) {
+        tmp[i] = container[i];
+      }
 
-        m_data = m_resource->allocate<value_type>(m_size);
-        m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size);
+      m_data = m_resource->allocate<value_type>(m_size);
+      m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size);
 
-        host_res.deallocate(tmp);
+      host_res.deallocate(tmp);
 
       return;
     }
@@ -382,7 +382,7 @@ class TypedListSegment
 
 
   // Copy of camp resource passed to ctor
-  camp::resources::Resource *m_resource;
+  camp::resources::Resource* m_resource;
 
   // Ownership flag to guide data copying/management
   IndexOwnership m_owned;
diff --git a/include/RAJA/index/RangeSegment.hpp b/include/RAJA/index/RangeSegment.hpp
index a41959c583..6da88b570a 100644
--- a/include/RAJA/index/RangeSegment.hpp
+++ b/include/RAJA/index/RangeSegment.hpp
@@ -18,16 +18,13 @@
 #ifndef RAJA_RangeSegment_HPP
 #define RAJA_RangeSegment_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 
+#include "RAJA/config.hpp"
+#include "RAJA/index/IndexValue.hpp"
 #include "RAJA/internal/Iterators.hpp"
-
 #include "RAJA/util/concepts.hpp"
 
-#include "RAJA/index/IndexValue.hpp"
-
 namespace RAJA
 {
 
@@ -50,10 +47,10 @@ namespace RAJA
  *
  * NOTE: TypedRangeSegment::iterator is a RandomAccessIterator
  *
- * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of 
+ * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of
  *       indices [-5, 3).
  *
- * NOTE: Proper handling of indices strides requires that StorageT is a 
+ * NOTE: Proper handling of indices strides requires that StorageT is a
  *       signed type.
  *
  * Usage:
@@ -92,15 +89,18 @@ namespace RAJA
  *
  ******************************************************************************
  */
-template <typename StorageT, typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
+template <typename StorageT,
+          typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
 struct TypedRangeSegment {
 
-  // 
+  //
   // Static asserts to provide some useful error messages during compilation
   // for incorrect usage.
-  // 
-  static_assert(std::is_signed<DiffT>::value, "TypedRangeSegment DiffT requires signed type.");
-  static_assert(!std::is_floating_point<StorageT>::value, "TypedRangeSegment Type must be non floating point.");
+  //
+  static_assert(std::is_signed<DiffT>::value,
+                "TypedRangeSegment DiffT requires signed type.");
+  static_assert(!std::is_floating_point<StorageT>::value,
+                "TypedRangeSegment Type must be non floating point.");
 
   //@{
   //!   @name Types used in implementation based on template parameters.
@@ -117,18 +117,18 @@ struct TypedRangeSegment {
   //@}
 
   //@{
-  //!   @name Constructors, destructor, and copy assignment. 
+  //!   @name Constructors, destructor, and copy assignment.
 
   /*!
    * \brief Construct a range segment repreenting the interval [begin, end)
-   * 
+   *
    * \param begin start value (inclusive) for the range
    * \param end end value (exclusive) for the range
    */
   using StripStorageT = strip_index_type_t<StorageT>;
-  RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin, StripStorageT end)
-      : m_begin(iterator(begin)), 
-        m_end(begin > end ? m_begin : iterator(end))
+  RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin,
+                                               StripStorageT end)
+      : m_begin(iterator(begin)), m_end(begin > end ? m_begin : iterator(end))
   {
   }
 
@@ -187,7 +187,7 @@ struct TypedRangeSegment {
    * \brief Compare this segment to another for inequality
    *
    * \return true if begin or end does not match, else false
-   */ 
+   */
   RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeSegment const& o) const
   {
     return !(operator==(o));
@@ -198,9 +198,9 @@ struct TypedRangeSegment {
   /*!
    * \brief Get a new TypedRangeSegment instance representing a slice of
    *        existing segment
-   * 
-   * \param begin start iterate of new range 
-   * \param length maximum length of new range 
+   *
+   * \param begin start iterate of new range
+   * \param length maximum length of new range
    * \return TypedRangeSegment representing the interval
    *         [ *begin() + begin, min( *begin() + begin + length, *end() ) )
    *
@@ -213,7 +213,7 @@ struct TypedRangeSegment {
    *     auto r = RAJA::TypedRangeSegment<int>(-4, 4);
    *
    *     // s repreents the subinterval  [-3, 2)
-   *     auto s = r.slice(1, 5); 
+   *     auto s = r.slice(1, 5);
    *
    *   \endverbatim
    */
@@ -247,8 +247,8 @@ struct TypedRangeSegment {
 /*!
  ******************************************************************************
  *
- * \class TypedRangeStrideSegment 
- * 
+ * \class TypedRangeStrideSegment
+ *
  * \brief  Segment class representing a strided range of typed indices
  *
  * \tparam StorageT underlying data type for the segment indices (required)
@@ -264,9 +264,9 @@ struct TypedRangeSegment {
  *
  * NOTE: TypedRangeStrideSegment::iterator is a RandomAccessIterator
  *
- * NOTE: TypedRangeStrideSegment allows for positive or negative strides and 
- *       indices. This allows for forward (stride > 0) or backward (stride < 0) 
- *       traversal of the iteration space. A stride of zero is undefined and 
+ * NOTE: TypedRangeStrideSegment allows for positive or negative strides and
+ *       indices. This allows for forward (stride > 0) or backward (stride < 0)
+ *       traversal of the iteration space. A stride of zero is undefined and
  *       will cause divide-by-zero errors.
  *
  * As with RangeSegment, the iteration space is inclusive of begin() and
@@ -275,7 +275,7 @@ struct TypedRangeSegment {
  * For positive strides, begin() > end() implies size()==0
  * For negative strides, begin() < end() implies size()==0
  *
- * NOTE: Proper handling of negative strides and indices requires that 
+ * NOTE: Proper handling of negative strides and indices requires that
  *       StorageT is a signed type.
  *
  * Usage:
@@ -321,15 +321,18 @@ struct TypedRangeSegment {
  *
  ******************************************************************************
  */
-template <typename StorageT, typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
+template <typename StorageT,
+          typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
 struct TypedRangeStrideSegment {
 
   //
   // Static asserts to provide some useful error messages during compilation
   // for incorrect usage.
   //
-  static_assert(std::is_signed<DiffT>::value, "TypedRangeStrideSegment DiffT requires signed type.");
-  static_assert(!std::is_floating_point<StorageT>::value, "TypedRangeStrideSegment Type must be non floating point.");
+  static_assert(std::is_signed<DiffT>::value,
+                "TypedRangeStrideSegment DiffT requires signed type.");
+  static_assert(!std::is_floating_point<StorageT>::value,
+                "TypedRangeStrideSegment Type must be non floating point.");
 
   //@{
   //!   @name Types used in implementation based on template parameters.
@@ -349,7 +352,7 @@ struct TypedRangeStrideSegment {
   //!   @name Constructors, destructor, and copy assignment.
 
   /*!
-   * \brief Construct a range segment for the interval [begin, end) with 
+   * \brief Construct a range segment for the interval [begin, end) with
    *        given stride
    *
    * \param begin start value (inclusive) for the range
@@ -408,8 +411,8 @@ struct TypedRangeStrideSegment {
 
   /*!
    * \brief Get size of this segment
-   * 
-   * The size is the number of iterates in the 
+   *
+   * The size is the number of iterates in the
    * interval [begin, end) when striding over it
    */
   RAJA_HOST_DEVICE DiffT size() const { return m_size; }
@@ -435,7 +438,8 @@ struct TypedRangeStrideSegment {
    *
    * \return true if begin, end, or size does not match, else false
    */
-  RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeStrideSegment const& o) const
+  RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(
+      TypedRangeStrideSegment const& o) const
   {
     return !(operator==(o));
   }
@@ -450,7 +454,7 @@ struct TypedRangeStrideSegment {
    * \param length maximum length of new range
    *
    * \return TypedRangeStrideSegment representing the interval
-   *         [ *begin() + begin * stride, 
+   *         [ *begin() + begin * stride,
    *           min( *begin() + (begin + length) * stride, *end() )
    *
    * Here's an example of a slice operation on a range segment with a negative
@@ -466,7 +470,7 @@ struct TypedRangeStrideSegment {
    *     //       5 indices in r starting at the 6th entry
    *     auto s = r.slice(6, 6);
    *
-   *   \endverbatim 
+   *   \endverbatim
    */
   RAJA_HOST_DEVICE TypedRangeStrideSegment slice(StorageT begin,
                                                  DiffT length) const
@@ -549,7 +553,7 @@ RAJA_HOST_DEVICE TypedRangeSegment<Common> make_range(BeginT&& begin,
 }
 
 /*!
- * \brief Function to make a TypedRangeStride Segment for the interval 
+ * \brief Function to make a TypedRangeStride Segment for the interval
  *        [begin, end) with given stride
  *
  *  \return a newly constructed TypedRangeStrideSegment where
@@ -566,8 +570,11 @@ RAJA_HOST_DEVICE TypedRangeStrideSegment<Common> make_strided_range(
     EndT&& end,
     StrideT&& stride)
 {
-  static_assert(std::is_signed<StrideT>::value, "make_strided_segment : stride must be signed.");
-  static_assert(std::is_same<make_signed_t<EndT>, StrideT>::value, "make_stride_segment : stride and end must be of similar types.");
+  static_assert(std::is_signed<StrideT>::value,
+                "make_strided_segment : stride must be signed.");
+  static_assert(std::is_same<make_signed_t<EndT>, StrideT>::value,
+                "make_stride_segment : stride and end must be of similar "
+                "types.");
   return {begin, end, stride};
 }
 
diff --git a/include/RAJA/internal/DepGraphNode.hpp b/include/RAJA/internal/DepGraphNode.hpp
index 8feceae22f..3b0fd8b016 100644
--- a/include/RAJA/internal/DepGraphNode.hpp
+++ b/include/RAJA/internal/DepGraphNode.hpp
@@ -19,13 +19,12 @@
 #ifndef RAJA_DepGraphNode_HPP
 #define RAJA_DepGraphNode_HPP
 
-#include "RAJA/config.hpp"
-
 #include <atomic>
 #include <cstdlib>
 #include <iosfwd>
 #include <thread>
 
+#include "RAJA/config.hpp"
 #include "RAJA/util/types.hpp"
 
 namespace RAJA
diff --git a/include/RAJA/internal/Iterators.hpp b/include/RAJA/internal/Iterators.hpp
index 6f32a56e6d..c127c96def 100644
--- a/include/RAJA/internal/Iterators.hpp
+++ b/include/RAJA/internal/Iterators.hpp
@@ -287,10 +287,14 @@ class strided_numeric_iterator
   using iterator_category = std::random_access_iterator_tag;
 
   constexpr strided_numeric_iterator() noexcept = default;
-  constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept = default;
-  constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept = default;
-  strided_numeric_iterator& operator=(const strided_numeric_iterator&) noexcept = default;
-  strided_numeric_iterator& operator=(strided_numeric_iterator&&) noexcept = default;
+  constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept =
+      default;
+  constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept =
+      default;
+  strided_numeric_iterator& operator=(
+      const strided_numeric_iterator&) noexcept = default;
+  strided_numeric_iterator& operator=(strided_numeric_iterator&&) noexcept =
+      default;
 
   RAJA_HOST_DEVICE constexpr strided_numeric_iterator(
       stripped_value_type rhs,
diff --git a/include/RAJA/internal/MemUtils_CPU.hpp b/include/RAJA/internal/MemUtils_CPU.hpp
index 55015f9ab7..19df756191 100644
--- a/include/RAJA/internal/MemUtils_CPU.hpp
+++ b/include/RAJA/internal/MemUtils_CPU.hpp
@@ -19,12 +19,11 @@
 #ifndef RAJA_MemUtils_CPU_HPP
 #define RAJA_MemUtils_CPU_HPP
 
-#include "RAJA/config.hpp"
-
 #include <cstddef>
 #include <cstdlib>
 #include <memory>
 
+#include "RAJA/config.hpp"
 #include "RAJA/util/types.hpp"
 
 #if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || \
@@ -95,27 +94,22 @@ inline void free_aligned(void* ptr)
 ///
 /// Deleter function object for memory allocated with allocate_aligned
 ///
-struct FreeAligned
-{
-  void operator()(void* ptr)
-  {
-    free_aligned(ptr);
-  }
+struct FreeAligned {
+  void operator()(void* ptr) { free_aligned(ptr); }
 };
 
 ///
 /// Deleter function object for memory allocated with allocate_aligned_type
 /// that calls the destructor for the fist size objects in the storage.
 ///
-template < typename T, typename index_type >
-struct FreeAlignedType : FreeAligned
-{
+template <typename T, typename index_type>
+struct FreeAlignedType : FreeAligned {
   index_type size = 0;
 
   void operator()(T* ptr)
   {
-    for ( index_type i = size; i > 0; --i ) {
-      ptr[i-1].~T();
+    for (index_type i = size; i > 0; --i) {
+      ptr[i - 1].~T();
     }
     FreeAligned::operator()(ptr);
   }
diff --git a/include/RAJA/internal/RAJAVec.hpp b/include/RAJA/internal/RAJAVec.hpp
index 1d0ec0cbeb..a97ceb5b7b 100644
--- a/include/RAJA/internal/RAJAVec.hpp
+++ b/include/RAJA/internal/RAJAVec.hpp
@@ -19,13 +19,12 @@
 #ifndef RAJAVec_HPP
 #define RAJAVec_HPP
 
-#include "RAJA/config.hpp"
-
 #include <algorithm>
 #include <cstddef>
 #include <memory>
 #include <utility>
 
+#include "RAJA/config.hpp"
 #include "RAJA/internal/MemUtils_CPU.hpp"
 
 namespace RAJA
@@ -57,8 +56,9 @@ class RAJAVec
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
+
 public:
   using value_type = T;
   using allocator_type = Allocator;
@@ -86,7 +86,9 @@ class RAJAVec
   ///
   RAJAVec(const RAJAVec& other)
       : m_data(nullptr),
-        m_allocator(allocator_traits_type::select_on_container_copy_construction(other.m_allocator)),
+        m_allocator(
+            allocator_traits_type::select_on_container_copy_construction(
+                other.m_allocator)),
         m_capacity(0),
         m_size(0)
   {
@@ -125,7 +127,8 @@ class RAJAVec
   RAJAVec& operator=(RAJAVec&& rhs)
   {
     if (&rhs != this) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment{});
     }
     return *this;
   }
@@ -150,25 +153,25 @@ class RAJAVec
   ///
   /// Get a pointer to the beginning of the contiguous vector
   ///
-        pointer data()       { return m_data; }
+  pointer data() { return m_data; }
   ///
   const_pointer data() const { return m_data; }
 
   ///
   /// Get an iterator to the end.
   ///
-        iterator  end()       { return m_data + m_size; }
+  iterator end() { return m_data + m_size; }
   ///
-  const_iterator  end() const { return m_data + m_size; }
+  const_iterator end() const { return m_data + m_size; }
   ///
   const_iterator cend() const { return m_data + m_size; }
 
   ///
   /// Get an iterator to the beginning.
   ///
-        iterator  begin()       { return m_data; }
+  iterator begin() { return m_data; }
   ///
-  const_iterator  begin() const { return m_data; }
+  const_iterator begin() const { return m_data; }
   ///
   const_iterator cbegin() const { return m_data; }
 
@@ -200,18 +203,12 @@ class RAJAVec
   ///
   /// Shrink the capacity of the vector to the current size.
   ///
-  void shrink_to_fit()
-  {
-    shrink_cap(m_size);
-  }
+  void shrink_to_fit() { shrink_cap(m_size); }
 
   ///
   /// Empty vector of all data.
   ///
-  void clear()
-  {
-    destroy_items_after(0);
-  }
+  void clear() { destroy_items_after(0); }
 
   ///
   /// Change the size of the vector,
@@ -248,23 +245,23 @@ class RAJAVec
   ///
   /// Bracket operator accessor.
   ///
-        reference operator[](difference_type i)       { return m_data[i]; }
+  reference operator[](difference_type i) { return m_data[i]; }
   ///
   const_reference operator[](difference_type i) const { return m_data[i]; }
 
   ///
   /// Access the last item of the vector.
   ///
-        reference front()       { return m_data[0]; }
+  reference front() { return m_data[0]; }
   ///
   const_reference front() const { return m_data[0]; }
 
   ///
   /// Access the last item of the vector.
   ///
-        reference back()       { return m_data[m_size-1]; }
+  reference back() { return m_data[m_size - 1]; }
   ///
-  const_reference back() const { return m_data[m_size-1]; }
+  const_reference back() const { return m_data[m_size - 1]; }
 
   ///
   /// Add item to front end of vector. Note that this operation is unique to
@@ -272,28 +269,31 @@ class RAJAVec
   ///
   void push_front(const_reference item) { emplace_front_private(item); }
   ///
-  void push_front(   value_type&& item) { emplace_front_private(std::move(item)); }
+  void push_front(value_type&& item) { emplace_front_private(std::move(item)); }
   ///
-  template < typename ... Os >
-  void emplace_front(Os&&... os) { emplace_front_private(std::forward<Os>(os)...); }
+  template <typename... Os>
+  void emplace_front(Os&&... os)
+  {
+    emplace_front_private(std::forward<Os>(os)...);
+  }
 
   ///
   /// Add item to back end of vector.
   ///
   void push_back(const_reference item) { emplace_back_private(item); }
   ///
-  void push_back(   value_type&& item) { emplace_back_private(std::move(item)); }
+  void push_back(value_type&& item) { emplace_back_private(std::move(item)); }
   ///
-  template < typename ... Os >
-  void emplace_back(Os&&... os) { emplace_back_private(std::forward<Os>(os)...); }
+  template <typename... Os>
+  void emplace_back(Os&&... os)
+  {
+    emplace_back_private(std::forward<Os>(os)...);
+  }
 
   ///
   /// Remove the last item of the vector.
   ///
-  void pop_back()
-  {
-    destroy_items_after(m_size-1);
-  }
+  void pop_back() { destroy_items_after(m_size - 1); }
 
 private:
   pointer m_data;
@@ -386,10 +386,10 @@ class RAJAVec
   void swap_private(RAJAVec& other, std::true_type)
   {
     using std::swap;
-    swap(m_data,      other.m_data);
+    swap(m_data, other.m_data);
     swap(m_allocator, other.m_allocator);
-    swap(m_capacity,  other.m_capacity);
-    swap(m_size,      other.m_size);
+    swap(m_capacity, other.m_capacity);
+    swap(m_size, other.m_size);
   }
 
   ///
@@ -398,9 +398,9 @@ class RAJAVec
   void swap_private(RAJAVec& other, std::false_type)
   {
     using std::swap;
-    swap(m_data,      other.m_data);
-    swap(m_capacity,  other.m_capacity);
-    swap(m_size,      other.m_size);
+    swap(m_data, other.m_data);
+    swap(m_capacity, other.m_capacity);
+    swap(m_size, other.m_size);
   }
 
   //
@@ -426,11 +426,13 @@ class RAJAVec
   //
   // Construct items [m_size, new_size) from args.
   //
-  template < typename ... Os >
+  template <typename... Os>
   void construct_items_back(size_type new_size, Os&&... os)
   {
     for (; m_size < new_size; ++m_size) {
-      allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward<Os>(os)...);
+      allocator_traits_type::construct(m_allocator,
+                                       m_data + m_size,
+                                       std::forward<Os>(os)...);
     }
   }
 
@@ -440,7 +442,9 @@ class RAJAVec
   void copy_construct_items_back(size_type new_size, const_pointer o_data)
   {
     for (; m_size < new_size; ++m_size) {
-      allocator_traits_type::construct(m_allocator, m_data+m_size, o_data[m_size]);
+      allocator_traits_type::construct(m_allocator,
+                                       m_data + m_size,
+                                       o_data[m_size]);
     }
   }
 
@@ -450,7 +454,9 @@ class RAJAVec
   void move_construct_items_back(size_type new_size, pointer o_data)
   {
     for (; m_size < new_size; ++m_size) {
-      allocator_traits_type::construct(m_allocator, m_data+m_size, std::move(o_data[m_size]));
+      allocator_traits_type::construct(m_allocator,
+                                       m_data + m_size,
+                                       std::move(o_data[m_size]));
     }
   }
 
@@ -460,38 +466,44 @@ class RAJAVec
   void destroy_items_after(size_type new_end)
   {
     for (; m_size > new_end; --m_size) {
-      allocator_traits_type::destroy(m_allocator, m_data+m_size-1);
+      allocator_traits_type::destroy(m_allocator, m_data + m_size - 1);
     }
   }
 
   //
   // Add an item to the front, shifting all existing items back one.
   //
-  template < typename ... Os >
+  template <typename... Os>
   void emplace_front_private(Os&&... os)
   {
     reserve(m_size + 1);
 
     if (m_size > 0) {
       size_type i = m_size;
-      allocator_traits_type::construct(m_allocator, m_data+i, std::move(m_data[i - 1]));
+      allocator_traits_type::construct(m_allocator,
+                                       m_data + i,
+                                       std::move(m_data[i - 1]));
       for (--i; i > 0; --i) {
         m_data[i] = std::move(m_data[i - 1]);
       }
       allocator_traits_type::destroy(m_allocator, m_data);
     }
-    allocator_traits_type::construct(m_allocator, m_data, std::forward<Os>(os)...);
+    allocator_traits_type::construct(m_allocator,
+                                     m_data,
+                                     std::forward<Os>(os)...);
     m_size++;
   }
 
   //
   // Add an item to the back.
   //
-  template < typename ... Os >
+  template <typename... Os>
   void emplace_back_private(Os&&... os)
   {
     reserve(m_size + 1);
-    allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward<Os>(os)...);
+    allocator_traits_type::construct(m_allocator,
+                                     m_data + m_size,
+                                     std::forward<Os>(os)...);
     m_size++;
   }
 
@@ -548,8 +560,10 @@ class RAJAVec
 
     if (m_data) {
       for (size_type i = 0; i < m_size; ++i) {
-        allocator_traits_type::construct(m_allocator, tdata+i, std::move(m_data[i]));
-        allocator_traits_type::destroy(m_allocator, m_data+i);
+        allocator_traits_type::construct(m_allocator,
+                                         tdata + i,
+                                         std::move(m_data[i]));
+        allocator_traits_type::destroy(m_allocator, m_data + i);
       }
       allocator_traits_type::deallocate(m_allocator, m_data, m_capacity);
     }
diff --git a/include/RAJA/internal/fault_tolerance.hpp b/include/RAJA/internal/fault_tolerance.hpp
index cf3a86cede..baa4941b38 100644
--- a/include/RAJA/internal/fault_tolerance.hpp
+++ b/include/RAJA/internal/fault_tolerance.hpp
@@ -35,6 +35,7 @@
 #if defined(RAJA_REPORT_FT)
 
 #include <stdio.h>
+
 #include "cycle.h"
 
 #define RAJA_FT_BEGIN                          \
@@ -80,17 +81,20 @@
     do {                          \
       repeat = false;
 
-#define RAJA_FT_END        \
-  if (fault_type > 0) {    \
-    /* invalidate cache */ \
-    repeat = true;         \
-    fault_type = 0;        \
-  }                        \
-  }                        \
-  while (repeat == true)   \
-    ;                      \
-  }                        \
-  else { fault_type = 0; /* ignore for the simulation */ }
+#define RAJA_FT_END                                 \
+  if (fault_type > 0) {                             \
+    /* invalidate cache */                          \
+    repeat = true;                                  \
+    fault_type = 0;                                 \
+  }                                                 \
+  }                                                 \
+  while (repeat == true)                            \
+    ;                                               \
+  }                                                 \
+  else                                              \
+  {                                                 \
+    fault_type = 0; /* ignore for the simulation */ \
+  }
 
 #endif  // RAJA_REPORT_FT
 
diff --git a/include/RAJA/internal/foldl.hpp b/include/RAJA/internal/foldl.hpp
index af65c05392..02a5fa3a2b 100644
--- a/include/RAJA/internal/foldl.hpp
+++ b/include/RAJA/internal/foldl.hpp
@@ -18,17 +18,15 @@
 #ifndef RAJA_foldl_HPP
 #define RAJA_foldl_HPP
 
-#include "RAJA/config.hpp"
-
 #include <cstdint>
 #include <functional>
 #include <iostream>
 #include <type_traits>
 #include <utility>
 
-#include "camp/camp.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/util/macros.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -61,11 +59,13 @@ template <typename Op,
           typename Arg3,
           typename... Rest>
 struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...> {
-  using Ret = typename foldl_impl<
-      Op,
-      typename std::invoke_result<Op, typename std::invoke_result<Op, Arg1, Arg2>::type,
-                                      Arg3>::type,
-      Rest...>::Ret;
+  using Ret =
+      typename foldl_impl<Op,
+                          typename std::invoke_result<
+                              Op,
+                              typename std::invoke_result<Op, Arg1, Arg2>::type,
+                              Arg3>::type,
+                          Rest...>::Ret;
 };
 
 #else
@@ -90,7 +90,7 @@ struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...> {
 
 #endif
 
-} // namespace detail
+}  // namespace detail
 
 template <typename Op, typename Arg1>
 RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(
diff --git a/include/RAJA/internal/get_platform.hpp b/include/RAJA/internal/get_platform.hpp
index 0354d04bfd..ccf233c72a 100644
--- a/include/RAJA/internal/get_platform.hpp
+++ b/include/RAJA/internal/get_platform.hpp
@@ -1,22 +1,24 @@
 #ifndef RAJA_get_platform_HPP
 #define RAJA_get_platform_HPP
 
-#include "RAJA/util/Operators.hpp"
 #include "RAJA/internal/foldl.hpp"
 #include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/util/Operators.hpp"
 
 namespace RAJA
 {
 
-namespace policy {
-namespace multi {
+namespace policy
+{
+namespace multi
+{
 template <typename Selector, typename... Policies>
 class MultiPolicy;
 
 }
-}
+}  // namespace policy
 
-namespace detail 
+namespace detail
 {
 
 struct max_platform {
@@ -66,11 +68,11 @@ struct get_platform_from_list<> {
  * (not for MultiPolicy or nested::Policy)
  */
 template <typename T>
-struct get_platform<T,
-                    typename std::
-                        enable_if<std::is_base_of<RAJA::PolicyBase, T>::value
-                                  && !RAJA::type_traits::is_indexset_policy<T>::
-                                         value>::type> {
+struct get_platform<
+    T,
+    typename std::enable_if<
+        std::is_base_of<RAJA::PolicyBase, T>::value &&
+        !RAJA::type_traits::is_indexset_policy<T>::value>::type> {
 
   static constexpr Platform value = T::platform;
 };
@@ -124,7 +126,7 @@ struct get_platform<RAJA::policy::multi::MultiPolicy<SELECTOR, POLICIES...>> {
   static constexpr Platform value = Platform::undefined;
 };
 
-} // closing brace for detail namespace
-} // closing brace for RAJA namespace
+}  // namespace detail
+}  // namespace RAJA
 
-#endif // RAJA_get_platform_HPP
+#endif  // RAJA_get_platform_HPP
diff --git a/include/RAJA/pattern/WorkGroup.hpp b/include/RAJA/pattern/WorkGroup.hpp
index 767821b8d8..dd1fff40c6 100644
--- a/include/RAJA/pattern/WorkGroup.hpp
+++ b/include/RAJA/pattern/WorkGroup.hpp
@@ -19,11 +19,9 @@
 #define RAJA_PATTERN_WorkGroup_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/pattern/WorkGroup/WorkStorage.hpp"
-#include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
-
 #include "RAJA/internal/get_platform.hpp"
+#include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
+#include "RAJA/pattern/WorkGroup/WorkStorage.hpp"
 #include "RAJA/util/plugins.hpp"
 
 namespace RAJA
@@ -38,38 +36,42 @@ namespace RAJA
  *
  * \verbatim
 
-   WorkPool<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> pool(allocator);
+   WorkPool<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator>
+       pool(allocator);
 
    pool.enqueue(..., [=] (Index_type i, int* xarg0, int xarg1) {
       xarg0[i] = xarg1;
    });
 
-   WorkGroup<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> group = pool.instantiate();
+   WorkGroup<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> group =
+       pool.instantiate();
 
    int* xarg0 = ...;
    int xarg1 = ...;
-   WorkSite<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> site = group.run(xarg0, xarg1);
+   WorkSite<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> site =
+       group.run(xarg0, xarg1);
 
  * \endverbatim
  *
  ******************************************************************************
  */
-template < typename ... Args >
+template <typename... Args>
 using xargs = camp::list<Args...>;
 
-namespace detail {
+namespace detail
+{
 
-template < typename T >
+template <typename T>
 struct is_xargs {
   static constexpr bool value = false;
 };
 
-template < typename ... Args >
+template <typename... Args>
 struct is_xargs<xargs<Args...>> {
   static constexpr bool value = true;
 };
 
-}
+}  // namespace detail
 
 
 //
@@ -102,7 +104,8 @@ struct is_xargs<xargs<Args...>> {
       data[i] = 1;
    });
 
-   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group = pool.instantiate();
+   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group =
+       pool.instantiate();
 
  * \endverbatim
  *
@@ -113,10 +116,11 @@ template <typename WORKGROUP_POLICY_T,
           typename EXTRA_ARGS_T,
           typename ALLOCATOR_T>
 struct WorkPool {
-  static_assert(RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
+  static_assert(
+      RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
       "WorkPool: WORKGROUP_POLICY_T must be a workgroup policy");
   static_assert(detail::is_xargs<EXTRA_ARGS_T>::value,
-      "WorkPool: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
+                "WorkPool: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
 };
 
 /*!
@@ -135,9 +139,11 @@ struct WorkPool {
  *
  * \verbatim
 
-   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group = pool.instantiate();
+   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group =
+       pool.instantiate();
 
-   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site = group.run();
+   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site =
+       group.run();
 
  * \endverbatim
  *
@@ -148,10 +154,11 @@ template <typename WORKGROUP_POLICY_T,
           typename EXTRA_ARGS_T,
           typename ALLOCATOR_T>
 struct WorkGroup {
-  static_assert(RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
+  static_assert(
+      RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
       "WorkGroup: WORKGROUP_POLICY_T must be a workgroup policy");
   static_assert(detail::is_xargs<EXTRA_ARGS_T>::value,
-      "WorkGroup: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
+                "WorkGroup: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
 };
 
 /*!
@@ -170,7 +177,8 @@ struct WorkGroup {
  *
  * \verbatim
 
-   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site = group.run();
+   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site =
+       group.run();
 
    site.synchronize();
 
@@ -183,10 +191,11 @@ template <typename WORKGROUP_POLICY_T,
           typename EXTRA_ARGS_T,
           typename ALLOCATOR_T>
 struct WorkSite {
-  static_assert(RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
+  static_assert(
+      RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
       "WorkSite: WORKGROUP_POLICY_T must be a workgroup policy");
   static_assert(detail::is_xargs<EXTRA_ARGS_T>::value,
-      "WorkSite: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
+                "WorkSite: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
 };
 
 
@@ -195,7 +204,7 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
 struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
                                 ORDER_POLICY_T,
@@ -203,13 +212,15 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
                                 DISPATCH_POLICY_T>,
                 INDEX_T,
                 xargs<Args...>,
-                ALLOCATOR_T>
-{
+                ALLOCATOR_T> {
   using exec_policy = EXEC_POLICY_T;
   using order_policy = ORDER_POLICY_T;
   using storage_policy = STORAGE_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using policy = WorkGroupPolicy<exec_policy, order_policy, storage_policy, dispatch_policy>;
+  using policy = WorkGroupPolicy<exec_policy,
+                                 order_policy,
+                                 storage_policy,
+                                 dispatch_policy>;
   using index_type = INDEX_T;
   using xarg_type = xargs<Args...>;
   using Allocator = ALLOCATOR_T;
@@ -218,10 +229,16 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
   using worksite_type = WorkSite<policy, index_type, xarg_type, Allocator>;
 
 private:
-  using workrunner_type = detail::WorkRunner<
-      exec_policy, order_policy, dispatch_policy, Allocator, index_type, Args...>;
-  using storage_type = detail::WorkStorage<
-      storage_policy, Allocator, typename workrunner_type::dispatcher_type>;
+  using workrunner_type = detail::WorkRunner<exec_policy,
+                                             order_policy,
+                                             dispatch_policy,
+                                             Allocator,
+                                             index_type,
+                                             Args...>;
+  using storage_type =
+      detail::WorkStorage<storage_policy,
+                          Allocator,
+                          typename workrunner_type::dispatcher_type>;
 
   friend workgroup_type;
   friend worksite_type;
@@ -229,9 +246,7 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
 public:
   using resource_type = typename workrunner_type::resource_type;
 
-  explicit WorkPool(Allocator const& aloc)
-    : m_storage(aloc)
-  { }
+  explicit WorkPool(Allocator const& aloc) : m_storage(aloc) {}
 
   WorkPool(WorkPool const&) = delete;
   WorkPool& operator=(WorkPool const&) = delete;
@@ -239,27 +254,22 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
   WorkPool(WorkPool&&) = default;
   WorkPool& operator=(WorkPool&&) = default;
 
-  size_t num_loops() const
-  {
-    return m_storage.size();
-  }
+  size_t num_loops() const { return m_storage.size(); }
 
-  size_t storage_bytes() const
-  {
-    return m_storage.storage_size();
-  }
+  size_t storage_bytes() const { return m_storage.storage_size(); }
 
   void reserve(size_t num_loops, size_t storage_bytes)
   {
     m_storage.reserve(num_loops, storage_bytes);
   }
 
-  template < typename segment_T, typename loop_T >
+  template <typename segment_T, typename loop_T>
   inline void enqueue(segment_T&& seg, loop_T&& loop_body)
   {
     {
       // ignore zero length loops
-      using std::begin; using std::end;
+      using std::begin;
+      using std::end;
       if (begin(seg) == end(seg)) return;
     }
     if (m_storage.begin() == m_storage.end()) {
@@ -273,8 +283,7 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
     using RAJA::util::trigger_updates_before;
     auto body = trigger_updates_before(loop_body);
 
-    m_runner.enqueue(
-        m_storage, std::forward<segment_T>(seg), std::move(body));
+    m_runner.enqueue(m_storage, std::forward<segment_T>(seg), std::move(body));
 
     util::callPostCapturePlugins(context);
   }
@@ -289,10 +298,7 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
     m_runner.clear();
   }
 
-  ~WorkPool()
-  {
-    clear();
-  }
+  ~WorkPool() { clear(); }
 
 private:
   storage_type m_storage;
@@ -307,7 +313,7 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
 struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
                                  ORDER_POLICY_T,
@@ -315,13 +321,15 @@ struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
                                  DISPATCH_POLICY_T>,
                  INDEX_T,
                  xargs<Args...>,
-                 ALLOCATOR_T>
-{
+                 ALLOCATOR_T> {
   using exec_policy = EXEC_POLICY_T;
   using order_policy = ORDER_POLICY_T;
   using storage_policy = STORAGE_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using policy = WorkGroupPolicy<exec_policy, order_policy, storage_policy, dispatch_policy>;
+  using policy = WorkGroupPolicy<exec_policy,
+                                 order_policy,
+                                 storage_policy,
+                                 dispatch_policy>;
   using index_type = INDEX_T;
   using xarg_type = xargs<Args...>;
   using Allocator = ALLOCATOR_T;
@@ -347,7 +355,8 @@ struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
 
   inline worksite_type run(resource_type r, Args...);
 
-  worksite_type run(Args... args) {
+  worksite_type run(Args... args)
+  {
     auto r = resource_type::get_default();
     return run(r, std::move(args)...);
   }
@@ -360,19 +369,16 @@ struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
     m_runner.clear();
   }
 
-  ~WorkGroup()
-  {
-    clear();
-  }
+  ~WorkGroup() { clear(); }
 
 private:
   storage_type m_storage;
   workrunner_type m_runner;
 
   WorkGroup(storage_type&& storage, workrunner_type&& runner)
-    : m_storage(std::move(storage))
-    , m_runner(std::move(runner))
-  { }
+      : m_storage(std::move(storage)), m_runner(std::move(runner))
+  {
+  }
 };
 
 template <typename EXEC_POLICY_T,
@@ -380,7 +386,7 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
 struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
                                 ORDER_POLICY_T,
@@ -388,13 +394,15 @@ struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
                                 DISPATCH_POLICY_T>,
                 INDEX_T,
                 xargs<Args...>,
-                ALLOCATOR_T>
-{
+                ALLOCATOR_T> {
   using exec_policy = EXEC_POLICY_T;
   using order_policy = ORDER_POLICY_T;
   using storage_policy = STORAGE_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using policy = WorkGroupPolicy<exec_policy, order_policy, storage_policy, dispatch_policy>;
+  using policy = WorkGroupPolicy<exec_policy,
+                                 order_policy,
+                                 storage_policy,
+                                 dispatch_policy>;
   using index_type = INDEX_T;
   using xarg_type = xargs<Args...>;
   using Allocator = ALLOCATOR_T;
@@ -418,10 +426,7 @@ struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
   WorkSite(WorkSite&&) = default;
   WorkSite& operator=(WorkSite&&) = default;
 
-  resource_type get_resource() const
-  {
-    return m_resource;
-  }
+  resource_type get_resource() const { return m_resource; }
 
   void clear()
   {
@@ -429,19 +434,16 @@ struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
     // TODO: synchronize
   }
 
-  ~WorkSite()
-  {
-    clear();
-  }
+  ~WorkSite() { clear(); }
 
 private:
   per_run_storage m_run_storage;
   resource_type m_resource;
 
   explicit WorkSite(resource_type r, per_run_storage&& run_storage)
-    : m_run_storage(std::move(run_storage))
-    , m_resource(r)
-  { }
+      : m_run_storage(std::move(run_storage)), m_resource(r)
+  {
+  }
 };
 
 
@@ -450,19 +452,22 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
-inline
-typename WorkPool<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-    INDEX_T,
-    xargs<Args...>,
-    ALLOCATOR_T>::workgroup_type
-WorkPool<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-    INDEX_T,
-    xargs<Args...>,
-    ALLOCATOR_T>::instantiate()
+inline typename WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
+                                         ORDER_POLICY_T,
+                                         STORAGE_POLICY_T,
+                                         DISPATCH_POLICY_T>,
+                         INDEX_T,
+                         xargs<Args...>,
+                         ALLOCATOR_T>::workgroup_type
+WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
+                         ORDER_POLICY_T,
+                         STORAGE_POLICY_T,
+                         DISPATCH_POLICY_T>,
+         INDEX_T,
+         xargs<Args...>,
+         ALLOCATOR_T>::instantiate()
 {
   // update max sizes to auto-reserve on reuse
   m_max_num_loops = std::max(m_storage.size(), m_max_num_loops);
@@ -477,30 +482,37 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
-inline
-typename WorkGroup<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-    INDEX_T,
-    xargs<Args...>,
-    ALLOCATOR_T>::worksite_type
+inline typename WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
+                                          ORDER_POLICY_T,
+                                          STORAGE_POLICY_T,
+                                          DISPATCH_POLICY_T>,
+                          INDEX_T,
+                          xargs<Args...>,
+                          ALLOCATOR_T>::worksite_type
 WorkGroup<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
+    WorkGroupPolicy<EXEC_POLICY_T,
+                    ORDER_POLICY_T,
+                    STORAGE_POLICY_T,
+                    DISPATCH_POLICY_T>,
     INDEX_T,
     xargs<Args...>,
-    ALLOCATOR_T>::run(typename WorkGroup<
-                          WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-                          INDEX_T,
-                          xargs<Args...>,
-                          ALLOCATOR_T>::resource_type r,
+    ALLOCATOR_T>::run(typename WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
+                                                         ORDER_POLICY_T,
+                                                         STORAGE_POLICY_T,
+                                                         DISPATCH_POLICY_T>,
+                                         INDEX_T,
+                                         xargs<Args...>,
+                                         ALLOCATOR_T>::resource_type r,
                       Args... args)
 {
   util::PluginContext context{util::make_context<EXEC_POLICY_T>()};
   util::callPreLaunchPlugins(context);
 
   // move any per run storage into worksite
-  worksite_type site(r, m_runner.run(m_storage, r, std::forward<Args>(args)...));
+  worksite_type site(r,
+                     m_runner.run(m_storage, r, std::forward<Args>(args)...));
 
   util::callPostLaunchPlugins(context);
 
diff --git a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp
index 1eac283f4b..67ed9bccb3 100644
--- a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp
@@ -19,15 +19,13 @@
 #define RAJA_PATTERN_WORKGROUP_Dispatcher_HPP
 
 
-#include "RAJA/config.hpp"
+#include <utility>
 
+#include "RAJA/config.hpp"
 #include "RAJA/policy/WorkGroup.hpp"
-
-#include "camp/number.hpp"
-#include "camp/list.hpp"
 #include "camp/helpers.hpp"
-
-#include <utility>
+#include "camp/list.hpp"
+#include "camp/number.hpp"
 
 
 namespace RAJA
@@ -36,35 +34,34 @@ namespace RAJA
 namespace detail
 {
 
-template < typename >
-struct DispatcherVoidPtrWrapper
-{
+template <typename>
+struct DispatcherVoidPtrWrapper {
   void* ptr;
   DispatcherVoidPtrWrapper() = default;
   // implicit constructor from void*
-  RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) { }
+  RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) {}
 };
 
-template < typename >
-struct DispatcherVoidConstPtrWrapper
-{
+template <typename>
+struct DispatcherVoidConstPtrWrapper {
   const void* ptr;
   DispatcherVoidConstPtrWrapper() = default;
   // implicit constructor from const void*
-  RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) { }
+  RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) {}
 };
 
 
-constexpr bool dispatcher_use_host_invoke(Platform platform) {
+constexpr bool dispatcher_use_host_invoke(Platform platform)
+{
   return !(platform == Platform::cuda || platform == Platform::hip);
 }
 
 // Transforms one dispatch policy into another by creating a dispatch policy
 // of holder_type objects. See usage in WorkRunner for more explanation.
-template < typename dispatch_policy, typename holder_type >
+template <typename dispatch_policy, typename holder_type>
 struct dispatcher_transform_types;
 ///
-template < typename dispatch_policy, typename holder_type >
+template <typename dispatch_policy, typename holder_type>
 using dispatcher_transform_types_t =
     typename dispatcher_transform_types<dispatch_policy, holder_type>::type;
 
@@ -75,12 +72,16 @@ using dispatcher_transform_types_t =
  * DispatcherID is used to differentiate function pointers based on their
  * function signature.
  */
-template < Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs >
+template <Platform platform,
+          typename dispatch_policy,
+          typename DispatcherID,
+          typename... CallArgs>
 struct Dispatcher;
 
 
-template < typename holder_type >
-struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holder_type> {
+template <typename holder_type>
+struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch,
+                                  holder_type> {
   using type = ::RAJA::indirect_function_call_dispatch;
 };
 
@@ -93,8 +94,11 @@ struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holde
  * during device linking when functions with high register counts may cause
  * device linking to fail.
  */
-template < Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherID, CallArgs...> {
+template <Platform platform, typename DispatcherID, typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::indirect_function_call_dispatch,
+                  DispatcherID,
+                  CallArgs...> {
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
   using dispatch_policy = ::RAJA::indirect_function_call_dispatch;
   using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
@@ -104,27 +108,29 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   /// move construct an object of type T in dest as a copy of a T from src and
   /// destroy the T obj in src
   ///
-  template < typename T >
-  static void s_move_construct_destroy(void_ptr_wrapper dest, void_ptr_wrapper src)
+  template <typename T>
+  static void s_move_construct_destroy(void_ptr_wrapper dest,
+                                       void_ptr_wrapper src)
   {
     T* dest_as_T = static_cast<T*>(dest.ptr);
     T* src_as_T = static_cast<T*>(src.ptr);
-    new(dest_as_T) T(std::move(*src_as_T));
+    new (dest_as_T) T(std::move(*src_as_T));
     (*src_as_T).~T();
   }
 
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
-  template < typename T >
+  template <typename T>
   static void s_host_invoke(void_cptr_wrapper obj, CallArgs... args)
   {
     const T* obj_as_T = static_cast<const T*>(obj.ptr);
     (*obj_as_T)(std::forward<CallArgs>(args)...);
   }
   ///
-  template < typename T >
-  static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj, CallArgs... args)
+  template <typename T>
+  static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj,
+                                          CallArgs... args)
   {
     const T* obj_as_T = static_cast<const T*>(obj.ptr);
     (*obj_as_T)(std::forward<CallArgs>(args)...);
@@ -133,22 +139,25 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   ///
   /// destroy the object of type T in obj
   ///
-  template < typename T >
+  template <typename T>
   static void s_destroy(void_ptr_wrapper obj)
   {
     T* obj_as_T = static_cast<T*>(obj.ptr);
     (*obj_as_T).~T();
   }
 
-  using mover_type = void(*)(void_ptr_wrapper /*dest*/, void_ptr_wrapper /*src*/);
-  using invoker_type = void(*)(void_cptr_wrapper /*obj*/, CallArgs... /*args*/);
-  using destroyer_type = void(*)(void_ptr_wrapper /*obj*/);
+  using mover_type = void (*)(void_ptr_wrapper /*dest*/,
+                              void_ptr_wrapper /*src*/);
+  using invoker_type = void (*)(void_cptr_wrapper /*obj*/,
+                                CallArgs... /*args*/);
+  using destroyer_type = void (*)(void_ptr_wrapper /*obj*/);
 
   // This can't be a cuda device lambda due to compiler limitations
-  template < typename T >
+  template <typename T>
   struct DeviceInvokerFactory {
     using value_type = invoker_type;
-    RAJA_DEVICE value_type operator()() {
+    RAJA_DEVICE value_type operator()()
+    {
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
       return nullptr;
 #else
@@ -160,14 +169,15 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    return { mover_type{&s_move_construct_destroy<T>},
-             invoker_type{&s_host_invoke<T>},
-             destroyer_type{&s_destroy<T>},
-             sizeof(T)
-           };
+  template <typename T,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    return {mover_type{&s_move_construct_destroy<T>},
+            invoker_type{&s_host_invoke<T>},
+            destroyer_type{&s_destroy<T>},
+            sizeof(T)};
   }
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
@@ -179,14 +189,17 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   /// to create the invoker object. This allows for a separation between
   /// object creation and the device context (cuda, hip, etc) and copying.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) {
-    return { mover_type{&s_move_construct_destroy<T>},
-             invoker_type{std::forward<CreateOnDevice>(createOnDevice)(DeviceInvokerFactory<T>{})},
-             destroyer_type{&s_destroy<T>},
-             sizeof(T)
-           };
+  template <typename T,
+            typename CreateOnDevice,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice)
+  {
+    return {mover_type{&s_move_construct_destroy<T>},
+            invoker_type{std::forward<CreateOnDevice>(createOnDevice)(
+                DeviceInvokerFactory<T>{})},
+            destroyer_type{&s_destroy<T>},
+            sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -196,8 +209,9 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
 };
 
 
-template < typename holder_type >
-struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, holder_type> {
+template <typename holder_type>
+struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch,
+                                  holder_type> {
   using type = ::RAJA::indirect_virtual_function_dispatch;
 };
 
@@ -210,15 +224,19 @@ struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, ho
  * during device linking when functions with high register counts may cause
  * device linking to fail.
  */
-template < Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, DispatcherID, CallArgs...> {
+template <Platform platform, typename DispatcherID, typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::indirect_virtual_function_dispatch,
+                  DispatcherID,
+                  CallArgs...> {
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
   using dispatch_policy = ::RAJA::indirect_virtual_function_dispatch;
   using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
   struct impl_base {
-    virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const = 0;
+    virtual void move_destroy(void_ptr_wrapper dest,
+                              void_ptr_wrapper src) const = 0;
     virtual void destroy(void_ptr_wrapper obj) const = 0;
   };
 
@@ -227,21 +245,22 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
   };
 
   struct device_impl_base {
-    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0;
+    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj,
+                                    CallArgs... args) const = 0;
   };
 
-  template < typename T >
-  struct base_impl_type : impl_base
-  {
+  template <typename T>
+  struct base_impl_type : impl_base {
     ///
     /// move construct an object of type T in dest as a copy of a T from src and
     /// destroy the T obj in src
     ///
-    virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const override
+    virtual void move_destroy(void_ptr_wrapper dest,
+                              void_ptr_wrapper src) const override
     {
       T* dest_as_T = static_cast<T*>(dest.ptr);
       T* src_as_T = static_cast<T*>(src.ptr);
-      new(dest_as_T) T(std::move(*src_as_T));
+      new (dest_as_T) T(std::move(*src_as_T));
       (*src_as_T).~T();
     }
 
@@ -255,9 +274,8 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
     }
   };
 
-  template < typename T >
-  struct host_impl_type : host_impl_base
-  {
+  template <typename T>
+  struct host_impl_type : host_impl_base {
     ///
     /// invoke the call operator of the object of type T in obj with args
     ///
@@ -268,13 +286,13 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
     }
   };
 
-  template < typename T >
-  struct device_impl_type : device_impl_base
-  {
+  template <typename T>
+  struct device_impl_type : device_impl_base {
     ///
     /// invoke the call operator of the object of type T in obj with args
     ///
-    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const override
+    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj,
+                                    CallArgs... args) const override
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
@@ -304,23 +322,20 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
       m_impl->invoke(obj, std::forward<CallArgs>(args)...);
     }
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   struct destroyer_type {
     impl_base* m_impl;
-    void operator()(void_ptr_wrapper obj) const
-    {
-      m_impl->destroy(obj);
-    }
+    void operator()(void_ptr_wrapper obj) const { m_impl->destroy(obj); }
   };
 
   // This can't be a cuda device lambda due to compiler limitations
-  template < typename T >
+  template <typename T>
   struct DeviceImplTypeFactory {
     using value_type = device_impl_type<T>*;
-    RAJA_DEVICE value_type operator()() {
+    RAJA_DEVICE value_type operator()()
+    {
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
       return nullptr;
 #else
@@ -333,16 +348,17 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
+  template <typename T,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
     static base_impl_type<T> s_base_impl;
     static host_impl_type<T> s_host_impl;
-    return { mover_type{&s_base_impl},
-             host_invoker_type{&s_host_impl},
-             destroyer_type{&s_base_impl},
-             sizeof(T)
-           };
+    return {mover_type{&s_base_impl},
+            host_invoker_type{&s_host_impl},
+            destroyer_type{&s_base_impl},
+            sizeof(T)};
   }
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
@@ -354,17 +370,19 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
   /// to create the invoker object. This allows for a separation between
   /// object creation and the device context (cuda, hip, etc) and copying.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr>
-  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) {
+  template <typename T,
+            typename CreateOnDevice,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice)
+  {
     static base_impl_type<T> s_base_impl;
-    static device_impl_type<T>* s_device_impl_ptr{
-        std::forward<CreateOnDevice>(createOnDevice)(DeviceImplTypeFactory<T>{}) };
-    return { mover_type{&s_base_impl},
-             device_invoker_type{s_device_impl_ptr},
-             destroyer_type{&s_base_impl},
-             sizeof(T)
-           };
+    static device_impl_type<T>* s_device_impl_ptr{std::forward<CreateOnDevice>(
+        createOnDevice)(DeviceImplTypeFactory<T>{})};
+    return {mover_type{&s_base_impl},
+            device_invoker_type{s_device_impl_ptr},
+            destroyer_type{&s_base_impl},
+            sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -375,17 +393,21 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
 
 
 // direct_dispatch expects a list of types
-template < typename ... Ts, typename holder_type >
+template <typename... Ts, typename holder_type>
 struct dispatcher_transform_types<::RAJA::direct_dispatch<Ts...>, holder_type> {
-  using type = ::RAJA::direct_dispatch<typename holder_type::template type<Ts>...>;
+  using type =
+      ::RAJA::direct_dispatch<typename holder_type::template type<Ts>...>;
 };
 
 /*!
  * Version of Dispatcher that does direct dispatch to zero callable types.
  * It implements the interface with callable objects.
  */
-template < Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::direct_dispatch<>, DispatcherID, CallArgs...> {
+template <Platform platform, typename DispatcherID, typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::direct_dispatch<>,
+                  DispatcherID,
+                  CallArgs...> {
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
   using dispatch_policy = ::RAJA::direct_dispatch<>;
   using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
@@ -396,39 +418,36 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<>, DispatcherID, CallArgs...
   /// destroy the T obj in src
   ///
   struct mover_type {
-    void operator()(void_ptr_wrapper, void_ptr_wrapper) const
-    { }
+    void operator()(void_ptr_wrapper, void_ptr_wrapper) const {}
   };
 
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
   struct host_invoker_type {
-    void operator()(void_cptr_wrapper, CallArgs...) const
-    { }
+    void operator()(void_cptr_wrapper, CallArgs...) const {}
   };
   struct device_invoker_type {
-    RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const
-    { }
+    RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const {}
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   ///
   /// destroy the object of type T in obj
   ///
   struct destroyer_type {
-    void operator()(void_ptr_wrapper) const
-    { }
+    void operator()(void_ptr_wrapper) const {}
   };
 
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
+  template <typename T,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
     return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)};
   }
   ///
@@ -437,9 +456,12 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<>, DispatcherID, CallArgs...
   /// Ignore the CreateOnDevice object as the same invoker object can be used
   /// on the host and device.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&&) {
+  template <typename T,
+            typename CreateOnDevice,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&&)
+  {
     return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)};
   }
 
@@ -453,8 +475,14 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<>, DispatcherID, CallArgs...
  * Version of Dispatcher that does direct dispatch to a single callable type.
  * It implements the interface with callable objects.
  */
-template < Platform platform, typename T, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs...> {
+template <Platform platform,
+          typename T,
+          typename DispatcherID,
+          typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::direct_dispatch<T>,
+                  DispatcherID,
+                  CallArgs...> {
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
   using dispatch_policy = ::RAJA::direct_dispatch<T>;
   using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
@@ -469,7 +497,7 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
     {
       T* dest_as_T = static_cast<T*>(dest.ptr);
       T* src_as_T = static_cast<T*>(src.ptr);
-      new(dest_as_T) T(std::move(*src_as_T));
+      new (dest_as_T) T(std::move(*src_as_T));
       (*src_as_T).~T();
     }
   };
@@ -491,9 +519,8 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   ///
   /// destroy the object of type T in obj
@@ -509,10 +536,13 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename U,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    static_assert(std::is_same<T, U>::value, "U must be in direct_dispatch types");
+  template <typename U,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    static_assert(std::is_same<T, U>::value,
+                  "U must be in direct_dispatch types");
     return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)};
   }
   ///
@@ -521,10 +551,14 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
   /// Ignore the CreateOnDevice object as the same invoker object can be used
   /// on the host and device.
   ///
-  template< typename U, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&&) {
-    static_assert(std::is_same<T, U>::value, "U must be in direct_dispatch types");
+  template <typename U,
+            typename CreateOnDevice,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&&)
+  {
+    static_assert(std::is_same<T, U>::value,
+                  "U must be in direct_dispatch types");
     return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)};
   }
 
@@ -538,17 +572,23 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
  * Version of Dispatcher that does direct dispatch to multiple callable types.
  * It implements the interface with callable objects.
  */
-template < typename T0, typename T1, typename ... TNs,
-           Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
-                  DispatcherID, CallArgs...> {
+template <typename T0,
+          typename T1,
+          typename... TNs,
+          Platform platform,
+          typename DispatcherID,
+          typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::direct_dispatch<T0, T1, TNs...>,
+                  DispatcherID,
+                  CallArgs...> {
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
   using dispatch_policy = ::RAJA::direct_dispatch<T0, T1, TNs...>;
   using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
   using id_type = int;
-  using callable_indices = camp::make_int_seq_t<id_type, 2+sizeof...(TNs)>;
+  using callable_indices = camp::make_int_seq_t<id_type, 2 + sizeof...(TNs)>;
   using callable_types = camp::list<T0, T1, TNs...>;
 
   ///
@@ -560,24 +600,25 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
 
     void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  dest, src);
+      impl_helper(callable_indices{}, callable_types{}, dest, src);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_ptr_wrapper dest, void_ptr_wrapper src) const
+    template <int... id_types, typename... Ts>
+    void impl_helper(camp::int_seq<int, id_types...>,
+                     camp::list<Ts...>,
+                     void_ptr_wrapper dest,
+                     void_ptr_wrapper src) const
     {
       camp::sink(((id_types == id) ? (impl<Ts>(dest, src), 0) : 0)...);
     }
 
-    template < typename T >
+    template <typename T>
     void impl(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
       T* dest_as_T = static_cast<T*>(dest.ptr);
       T* src_as_T = static_cast<T*>(src.ptr);
-      new(dest_as_T) T(std::move(*src_as_T));
+      new (dest_as_T) T(std::move(*src_as_T));
       (*src_as_T).~T();
     }
   };
@@ -590,19 +631,25 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
 
     void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  obj, std::forward<CallArgs>(args)...);
+      impl_helper(callable_indices{},
+                  callable_types{},
+                  obj,
+                  std::forward<CallArgs>(args)...);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_cptr_wrapper obj, CallArgs... args) const
+    template <int... id_types, typename... Ts>
+    void impl_helper(camp::int_seq<int, id_types...>,
+                     camp::list<Ts...>,
+                     void_cptr_wrapper obj,
+                     CallArgs... args) const
     {
-      camp::sink(((id_types == id) ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0) : 0)...);
+      camp::sink(((id_types == id)
+                      ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0)
+                      : 0)...);
     }
 
-    template < typename T >
+    template <typename T>
     void impl(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
@@ -614,28 +661,33 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
 
     RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  obj, std::forward<CallArgs>(args)...);
+      impl_helper(callable_indices{},
+                  callable_types{},
+                  obj,
+                  std::forward<CallArgs>(args)...);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    RAJA_DEVICE void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_cptr_wrapper obj, CallArgs... args) const
+    template <int... id_types, typename... Ts>
+    RAJA_DEVICE void impl_helper(camp::int_seq<int, id_types...>,
+                                 camp::list<Ts...>,
+                                 void_cptr_wrapper obj,
+                                 CallArgs... args) const
     {
-      camp::sink(((id_types == id) ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0) : 0)...);
+      camp::sink(((id_types == id)
+                      ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0)
+                      : 0)...);
     }
 
-    template < typename T >
+    template <typename T>
     RAJA_DEVICE void impl(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   ///
   /// destroy the object of type T in obj
@@ -645,19 +697,19 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
 
     void operator()(void_ptr_wrapper obj) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  obj);
+      impl_helper(callable_indices{}, callable_types{}, obj);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_ptr_wrapper obj) const
+    template <int... id_types, typename... Ts>
+    void impl_helper(camp::int_seq<int, id_types...>,
+                     camp::list<Ts...>,
+                     void_ptr_wrapper obj) const
     {
       camp::sink(((id_types == id) ? (impl<Ts>(obj), 0) : 0)...);
     }
 
-    template < typename T >
+    template <typename T>
     void impl(void_ptr_wrapper obj) const
     {
       T* obj_as_T = static_cast<T*>(obj.ptr);
@@ -671,25 +723,32 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
   /// The id is just the index of T in the list of callable_types.
   /// If T is not in Ts return -1.
   ///
-  template < typename T, int ... id_types, typename ... Ts >
-  static constexpr id_type get_id(camp::int_seq<int, id_types...>, camp::list<Ts...>)
+  template <typename T, int... id_types, typename... Ts>
+  static constexpr id_type get_id(camp::int_seq<int, id_types...>,
+                                  camp::list<Ts...>)
   {
     id_type id{-1};
     // quiet UB warning by sequencing assignment to id with list initialization
-    int unused[] {0, (std::is_same<T, Ts>::value ? ((id = id_types), 0) : 0)...};
-    camp::sink(unused); // quiet unused var warning
+    int unused[]{0, (std::is_same<T, Ts>::value ? ((id = id_types), 0) : 0)...};
+    camp::sink(unused);  // quiet unused var warning
     return id;
   }
 
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    static constexpr id_type id = get_id<T>(callable_indices{}, callable_types{});
+  template <typename T,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    static constexpr id_type id =
+        get_id<T>(callable_indices{}, callable_types{});
     static_assert(id != id_type(-1), "T must be in direct_dispatch types");
-    return {mover_type{id}, host_invoker_type{id}, destroyer_type{id}, sizeof(T)};
+    return {mover_type{id},
+            host_invoker_type{id},
+            destroyer_type{id},
+            sizeof(T)};
   }
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
@@ -697,12 +756,19 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
   /// Ignore the CreateOnDevice object as the same invoker object can be used
   /// on the host and device.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&&) {
-    static constexpr id_type id = get_id<T>(callable_indices{}, callable_types{});
+  template <typename T,
+            typename CreateOnDevice,
+            bool uhi = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&&)
+  {
+    static constexpr id_type id =
+        get_id<T>(callable_indices{}, callable_types{});
     static_assert(id != id_type(-1), "T must be in direct_dispatch types");
-    return {mover_type{id}, device_invoker_type{id}, destroyer_type{id}, sizeof(T)};
+    return {mover_type{id},
+            device_invoker_type{id},
+            destroyer_type{id},
+            sizeof(T)};
   }
 
   mover_type move_construct_destroy;
diff --git a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp
index 9645f73050..d1945555a6 100644
--- a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp
@@ -18,17 +18,14 @@
 #ifndef RAJA_PATTERN_WORKGROUP_WorkRunner_HPP
 #define RAJA_PATTERN_WORKGROUP_WorkRunner_HPP
 
-#include "RAJA/config.hpp"
-
-#include <utility>
 #include <type_traits>
+#include <utility>
 
-#include "RAJA/policy/sequential/policy.hpp"
-
-#include "RAJA/pattern/forall.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
+#include "RAJA/pattern/forall.hpp"
 #include "RAJA/policy/WorkGroup.hpp"
+#include "RAJA/policy/sequential/policy.hpp"
 
 
 namespace RAJA
@@ -40,18 +37,18 @@ namespace detail
 /*!
  * A body and args holder for storing loops that are being executed in foralls
  */
-template <typename LoopBody, typename ... Args>
-struct HoldBodyArgs_base
-{
+template <typename LoopBody, typename... Args>
+struct HoldBodyArgs_base {
   // NOTE: This constructor is disabled when body_in is not LoopBody
   // to avoid it conflicting with the copy and move constructors
-  template < typename body_in,
-      typename = typename std::enable_if<
-        std::is_same<LoopBody, camp::decay<body_in>>::value>::type >
+  template <typename body_in,
+            typename = typename std::enable_if<
+                std::is_same<LoopBody, camp::decay<body_in>>::value>::type>
   HoldBodyArgs_base(body_in&& body, Args... args)
-    : m_body(std::forward<body_in>(body))
-    , m_arg_tuple(std::forward<Args>(args)...)
-  { }
+      : m_body(std::forward<body_in>(body)),
+        m_arg_tuple(std::forward<Args>(args)...)
+  {
+  }
 
 protected:
   LoopBody m_body;
@@ -62,9 +59,8 @@ struct HoldBodyArgs_base
  * A body and args holder for storing loops that are being executed in foralls
  * that run on the host
  */
-template <typename LoopBody, typename index_type, typename ... Args>
-struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...>
-{
+template <typename LoopBody, typename index_type, typename... Args>
+struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...> {
   using base = HoldBodyArgs_base<LoopBody, Args...>;
   using base::base;
 
@@ -73,7 +69,7 @@ struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...>
     invoke(i, camp::make_idx_seq_t<sizeof...(Args)>{});
   }
 
-  template < camp::idx_t ... Is >
+  template <camp::idx_t... Is>
   RAJA_INLINE void invoke(index_type i, camp::idx_seq<Is...>) const
   {
     this->m_body(i, get<Is>(this->m_arg_tuple)...);
@@ -84,9 +80,8 @@ struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...>
  * A body and args holder for storing loops that are being executed in foralls
  * that run on the device
  */
-template <typename LoopBody, typename index_type, typename ... Args>
-struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...>
-{
+template <typename LoopBody, typename index_type, typename... Args>
+struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...> {
   using base = HoldBodyArgs_base<LoopBody, Args...>;
   using base::base;
 
@@ -95,7 +90,7 @@ struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...>
     invoke(i, camp::make_idx_seq_t<sizeof...(Args)>{});
   }
 
-  template < camp::idx_t ... Is >
+  template <camp::idx_t... Is>
   RAJA_DEVICE RAJA_INLINE void invoke(index_type i, camp::idx_seq<Is...>) const
   {
     this->m_body(i, get<Is>(this->m_arg_tuple)...);
@@ -105,21 +100,24 @@ struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...>
 /*!
  * A body and segment holder for storing loops that will be executed as foralls
  */
-template <typename ExecutionPolicy, typename Segment_type, typename LoopBody,
-          typename index_type, typename ... Args>
-struct HoldForall
-{
+template <typename ExecutionPolicy,
+          typename Segment_type,
+          typename LoopBody,
+          typename index_type,
+          typename... Args>
+struct HoldForall {
   using resource_type = typename resources::get_resource<ExecutionPolicy>::type;
   using HoldBodyArgs = typename std::conditional<
       !type_traits::is_device_exec_policy<ExecutionPolicy>::value,
       HoldBodyArgs_host<LoopBody, index_type, Args...>,
-      HoldBodyArgs_device<LoopBody, index_type, Args...> >::type;
+      HoldBodyArgs_device<LoopBody, index_type, Args...>>::type;
 
-  template < typename segment_in, typename body_in >
+  template <typename segment_in, typename body_in>
   HoldForall(segment_in&& segment, body_in&& body)
-    : m_segment(std::forward<segment_in>(segment))
-    , m_body(std::forward<body_in>(body))
-  { }
+      : m_segment(std::forward<segment_in>(segment)),
+        m_body(std::forward<body_in>(body))
+  {
+  }
 
   RAJA_INLINE void operator()(resource_type r, Args... args) const
   {
@@ -143,7 +141,7 @@ template <typename EXEC_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner;
 
 
@@ -156,28 +154,30 @@ template <typename FORALL_EXEC_POLICY,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunnerForallOrdered_base
-{
+          typename... Args>
+struct WorkRunnerForallOrdered_base {
   using exec_policy = EXEC_POLICY_T;
   using order_policy = ORDER_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
   using Allocator = ALLOCATOR_T;
   using index_type = INDEX_T;
-  using resource_type = typename resources::get_resource<FORALL_EXEC_POLICY>::type;
+  using resource_type =
+      typename resources::get_resource<FORALL_EXEC_POLICY>::type;
 
   using forall_exec_policy = FORALL_EXEC_POLICY;
 
   // The type that will hold the segment and loop body in work storage
   struct holder_type {
-    template < typename T >
-    using type = HoldForall<forall_exec_policy,
-                            typename camp::at<T, camp::num<0>>::type, // segment_type
-                            typename camp::at<T, camp::num<1>>::type, // loop_type
-                            index_type, Args...>;
+    template <typename T>
+    using type =
+        HoldForall<forall_exec_policy,
+                   typename camp::at<T, camp::num<0>>::type,  // segment_type
+                   typename camp::at<T, camp::num<1>>::type,  // loop_type
+                   index_type,
+                   Args...>;
   };
   ///
-  template < typename T >
+  template <typename T>
   using holder_type_t = typename holder_type::template type<T>;
 
   // The policy indicating where the call function is invoked
@@ -186,33 +186,41 @@ struct WorkRunnerForallOrdered_base
 
   // The Dispatcher policy with holder_types used internally to handle the
   // ranges and callables passed in by the user.
-  using dispatcher_holder_policy = dispatcher_transform_types_t<dispatch_policy, holder_type>;
+  using dispatcher_holder_policy =
+      dispatcher_transform_types_t<dispatch_policy, holder_type>;
 
-  using dispatcher_type = Dispatcher<Platform::host, dispatcher_holder_policy, void, resource_type, Args...>;
+  using dispatcher_type = Dispatcher<Platform::host,
+                                     dispatcher_holder_policy,
+                                     void,
+                                     resource_type,
+                                     Args...>;
 
   WorkRunnerForallOrdered_base() = default;
 
   WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base const&) = delete;
-  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base const&) = delete;
+  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base const&) =
+      delete;
 
-  WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base &&) = default;
-  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base &&) = default;
+  WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base&&) = default;
+  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base&&) =
+      default;
 
   // runner interfaces with storage to enqueue so the runner can get
   // information from the segment and loop at enqueue time
-  template < typename WorkContainer, typename segment_T, typename loop_T >
+  template <typename WorkContainer, typename segment_T, typename loop_T>
   inline void enqueue(WorkContainer& storage, segment_T&& seg, loop_T&& loop)
   {
-    using holder = holder_type_t<camp::list<camp::decay<segment_T>, camp::decay<loop_T>>>;
+    using holder =
+        holder_type_t<camp::list<camp::decay<segment_T>, camp::decay<loop_T>>>;
 
-    storage.template emplace<holder>(
-        get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy{}),
-        std::forward<segment_T>(seg), std::forward<loop_T>(loop));
+    storage.template emplace<holder>(get_Dispatcher<holder, dispatcher_type>(
+                                         dispatcher_exec_policy{}),
+                                     std::forward<segment_T>(seg),
+                                     std::forward<loop_T>(loop));
   }
 
   // clear any state so ready to be destroyed or reused
-  void clear()
-  { }
+  void clear() {}
 
   // no extra storage required here
   using per_run_storage = int;
@@ -227,29 +235,26 @@ template <typename FORALL_EXEC_POLICY,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunnerForallOrdered
-    : WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>
-{
-  using base = WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>;
+    : WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                   EXEC_POLICY_T,
+                                   ORDER_POLICY_T,
+                                   DISPATCH_POLICY_T,
+                                   ALLOCATOR_T,
+                                   INDEX_T,
+                                   Args...> {
+  using base = WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                            EXEC_POLICY_T,
+                                            ORDER_POLICY_T,
+                                            DISPATCH_POLICY_T,
+                                            ALLOCATOR_T,
+                                            INDEX_T,
+                                            Args...>;
   using base::base;
 
   // run the loops using forall in the order that they were enqueued
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   typename base::per_run_storage run(WorkContainer const& storage,
                                      typename base::resource_type r,
                                      Args... args) const
@@ -276,29 +281,27 @@ template <typename FORALL_EXEC_POLICY,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunnerForallReverse
-    : WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>
-{
-  using base = WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>;
+    : WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                   EXEC_POLICY_T,
+                                   ORDER_POLICY_T,
+                                   DISPATCH_POLICY_T,
+                                   ALLOCATOR_T,
+                                   INDEX_T,
+                                   Args...> {
+  using base = WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                            EXEC_POLICY_T,
+                                            ORDER_POLICY_T,
+                                            DISPATCH_POLICY_T,
+                                            ALLOCATOR_T,
+                                            INDEX_T,
+                                            Args...>;
   using base::base;
 
-  // run the loops using forall in the reverse order to the order they were enqueued
-  template < typename WorkContainer >
+  // run the loops using forall in the reverse order to the order they were
+  // enqueued
+  template <typename WorkContainer>
   typename base::per_run_storage run(WorkContainer const& storage,
                                      typename base::resource_type r,
                                      Args... args) const
@@ -309,7 +312,7 @@ struct WorkRunnerForallReverse
 
     auto begin = storage.begin();
     for (auto iter = storage.end(); iter != begin; --iter) {
-      value_type::host_call(&*(iter-1), r, args...);
+      value_type::host_call(&*(iter - 1), r, args...);
     }
 
     return run_storage;
diff --git a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp
index 52631d108f..6e0216e72d 100644
--- a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp
+++ b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp
@@ -18,19 +18,16 @@
 #ifndef RAJA_PATTERN_WORKGROUP_WorkStorage_HPP
 #define RAJA_PATTERN_WORKGROUP_WorkStorage_HPP
 
-#include "RAJA/config.hpp"
-
 #include <cstddef>
 #include <memory>
-#include <utility>
 #include <type_traits>
+#include <utility>
 
-#include "RAJA/util/Operators.hpp"
-#include "RAJA/util/macros.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/internal/RAJAVec.hpp"
-
 #include "RAJA/pattern/WorkGroup/WorkStruct.hpp"
+#include "RAJA/util/Operators.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -46,9 +43,8 @@ namespace detail
 //   operator -  ( iterator_base const& )
 //   operator == ( iterator_base const& )
 //   operator <  ( iterator_base const& )
-template < typename iterator_base >
-struct random_access_iterator : iterator_base
-{
+template <typename iterator_base>
+struct random_access_iterator : iterator_base {
   using base = iterator_base;
   using value_type = const typename base::value_type;
   using pointer = typename base::pointer;
@@ -59,10 +55,10 @@ struct random_access_iterator : iterator_base
   using base::base;
 
   random_access_iterator(random_access_iterator const&) = default;
-  random_access_iterator(random_access_iterator &&) = default;
+  random_access_iterator(random_access_iterator&&) = default;
 
   random_access_iterator& operator=(random_access_iterator const&) = default;
-  random_access_iterator& operator=(random_access_iterator &&) = default;
+  random_access_iterator& operator=(random_access_iterator&&) = default;
 
 
   RAJA_HOST_DEVICE reference operator*() const
@@ -70,10 +66,7 @@ struct random_access_iterator : iterator_base
     return *static_cast<base const&>(*this);
   }
 
-  RAJA_HOST_DEVICE pointer operator->() const
-  {
-    return &(*(*this));
-  }
+  RAJA_HOST_DEVICE pointer operator->() const { return &(*(*this)); }
 
   RAJA_HOST_DEVICE reference operator[](difference_type i) const
   {
@@ -121,7 +114,8 @@ struct random_access_iterator : iterator_base
   }
 
   RAJA_HOST_DEVICE friend inline random_access_iterator operator+(
-      random_access_iterator const& lhs, difference_type rhs)
+      random_access_iterator const& lhs,
+      difference_type rhs)
   {
     random_access_iterator copy = lhs;
     copy += rhs;
@@ -129,7 +123,8 @@ struct random_access_iterator : iterator_base
   }
 
   RAJA_HOST_DEVICE friend inline random_access_iterator operator+(
-      difference_type lhs, random_access_iterator const& rhs)
+      difference_type lhs,
+      random_access_iterator const& rhs)
   {
     random_access_iterator copy = rhs;
     copy += lhs;
@@ -137,7 +132,8 @@ struct random_access_iterator : iterator_base
   }
 
   RAJA_HOST_DEVICE friend inline random_access_iterator operator-(
-      random_access_iterator const& lhs, difference_type rhs)
+      random_access_iterator const& lhs,
+      difference_type rhs)
   {
     random_access_iterator copy = lhs;
     copy -= rhs;
@@ -145,43 +141,50 @@ struct random_access_iterator : iterator_base
   }
 
   RAJA_HOST_DEVICE friend inline difference_type operator-(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return static_cast<base const&>(lhs) - static_cast<base const&>(rhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator==(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return static_cast<base const&>(lhs) == static_cast<base const&>(rhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator!=(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return !(lhs == rhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator<(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return static_cast<base const&>(lhs) < static_cast<base const&>(rhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator<=(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return !(rhs < lhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator>(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return rhs < lhs;
   }
 
   RAJA_HOST_DEVICE friend inline bool operator>=(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return !(lhs < rhs);
   }
@@ -191,10 +194,12 @@ struct random_access_iterator : iterator_base
 /*!
  * A storage container for work groups
  */
-template < typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Dispatcher_T >
+template <typename STORAGE_POLICY_T,
+          typename ALLOCATOR_T,
+          typename Dispatcher_T>
 class WorkStorage;
 
-template < typename ALLOCATOR_T, typename Dispatcher_T >
+template <typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
 {
   using allocator_traits_type = std::allocator_traits<ALLOCATOR_T>;
@@ -202,15 +207,17 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
-  static_assert(std::is_same<typename allocator_traits_type::value_type, char>::value,
+  static_assert(
+      std::is_same<typename allocator_traits_type::value_type, char>::value,
       "WorkStorage expects an allocator for 'char's.");
+
 public:
   using storage_policy = RAJA::array_of_pointers;
   using dispatcher_type = Dispatcher_T;
 
-  template < typename holder >
+  template <typename holder>
   using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
 
   using value_type = GenericWorkStruct<dispatcher_type>;
@@ -224,31 +231,24 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
 
 private:
   // struct used in storage vector to retain pointer and allocation size
-  struct pointer_and_size
-  {
+  struct pointer_and_size {
     pointer ptr;
     size_type size;
   };
 
 public:
-
-  // iterator base class for accessing stored WorkStructs outside of the container
-  struct const_iterator_base
-  {
+  // iterator base class for accessing stored WorkStructs outside of the
+  // container
+  struct const_iterator_base {
     using value_type = const typename WorkStorage::value_type;
     using pointer = typename WorkStorage::const_pointer;
     using reference = typename WorkStorage::const_reference;
     using difference_type = typename WorkStorage::difference_type;
     using iterator_category = std::random_access_iterator_tag;
 
-    const_iterator_base(const pointer_and_size* ptrptr)
-      : m_ptrptr(ptrptr)
-    { }
+    const_iterator_base(const pointer_and_size* ptrptr) : m_ptrptr(ptrptr) {}
 
-    RAJA_HOST_DEVICE reference operator*() const
-    {
-      return *(m_ptrptr->ptr);
-    }
+    RAJA_HOST_DEVICE reference operator*() const { return *(m_ptrptr->ptr); }
 
     RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n)
     {
@@ -257,19 +257,22 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
     }
 
     RAJA_HOST_DEVICE friend inline difference_type operator-(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_ptrptr - rhs_iter.m_ptrptr;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator==(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_ptrptr == rhs_iter.m_ptrptr;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator<(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_ptrptr < rhs_iter.m_ptrptr;
     }
@@ -282,22 +285,23 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
 
 
   explicit WorkStorage(allocator_type const& aloc)
-    : m_vec(0, aloc)
-    , m_aloc(aloc)
-  { }
+      : m_vec(0, aloc), m_aloc(aloc)
+  {
+  }
 
   WorkStorage(WorkStorage const&) = delete;
   WorkStorage& operator=(WorkStorage const&) = delete;
 
   WorkStorage(WorkStorage&& rhs)
-    : m_vec(std::move(rhs.m_vec))
-    , m_aloc(std::move(rhs.m_aloc))
-  { }
+      : m_vec(std::move(rhs.m_vec)), m_aloc(std::move(rhs.m_aloc))
+  {
+  }
 
   WorkStorage& operator=(WorkStorage&& rhs)
   {
     if (this != &rhs) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment{});
     }
     return *this;
   }
@@ -312,20 +316,11 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   }
 
   // number of loops stored
-  size_type size() const
-  {
-    return m_vec.size();
-  }
+  size_type size() const { return m_vec.size(); }
 
-  const_iterator begin() const
-  {
-    return const_iterator(m_vec.begin());
-  }
+  const_iterator begin() const { return const_iterator(m_vec.begin()); }
 
-  const_iterator end() const
-  {
-    return const_iterator(m_vec.end());
-  }
+  const_iterator end() const { return const_iterator(m_vec.end()); }
 
   // number of bytes used for storage of loops
   size_type storage_size() const
@@ -337,11 +332,13 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
     return storage_size_nbytes;
   }
 
-  template < typename holder, typename ... holder_ctor_args >
-  void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template <typename holder, typename... holder_ctor_args>
+  void emplace(const dispatcher_type* dispatcher,
+               holder_ctor_args&&... ctor_args)
   {
-    m_vec.emplace_back(create_value<holder>(
-        dispatcher, std::forward<holder_ctor_args>(ctor_args)...));
+    m_vec.emplace_back(
+        create_value<holder>(dispatcher,
+                             std::forward<holder_ctor_args>(ctor_args)...));
   }
 
   // destroy all stored loops, deallocates all storage
@@ -354,13 +351,13 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
     m_vec.shrink_to_fit();
   }
 
-  ~WorkStorage()
-  {
-    clear();
-  }
+  ~WorkStorage() { clear(); }
 
 private:
-  RAJAVec<pointer_and_size, typename allocator_traits_type::template rebind_alloc<pointer_and_size>> m_vec;
+  RAJAVec<
+      pointer_and_size,
+      typename allocator_traits_type::template rebind_alloc<pointer_and_size>>
+      m_vec;
   allocator_type m_aloc;
 
   // move assignment if allocator propagates on move assignment
@@ -389,7 +386,7 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   }
 
   // allocate and construct value in storage
-  template < typename holder, typename ... holder_ctor_args >
+  template <typename holder, typename... holder_ctor_args>
   pointer_and_size create_value(const dispatcher_type* dispatcher,
                                 holder_ctor_args&&... ctor_args)
   {
@@ -415,7 +412,9 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
     value_type::move_destroy(value_ptr, other_value_and_size.ptr);
 
     allocator_traits_type::deallocate(rhs.m_aloc,
-        reinterpret_cast<char*>(other_value_and_size.ptr), other_value_and_size.size);
+                                      reinterpret_cast<char*>(
+                                          other_value_and_size.ptr),
+                                      other_value_and_size.size);
 
     return pointer_and_size{value_ptr, other_value_and_size.size};
   }
@@ -425,11 +424,13 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   {
     value_type::destroy(value_and_size_ptr.ptr);
     allocator_traits_type::deallocate(m_aloc,
-        reinterpret_cast<char*>(value_and_size_ptr.ptr), value_and_size_ptr.size);
+                                      reinterpret_cast<char*>(
+                                          value_and_size_ptr.ptr),
+                                      value_and_size_ptr.size);
   }
 };
 
-template < typename ALLOCATOR_T, typename Dispatcher_T >
+template <typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 {
   using allocator_traits_type = std::allocator_traits<ALLOCATOR_T>;
@@ -437,15 +438,17 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
-  static_assert(std::is_same<typename allocator_traits_type::value_type, char>::value,
+  static_assert(
+      std::is_same<typename allocator_traits_type::value_type, char>::value,
       "WorkStorage expects an allocator for 'char's.");
+
 public:
   using storage_policy = RAJA::ragged_array_of_objects;
   using dispatcher_type = Dispatcher_T;
 
-  template < typename holder >
+  template <typename holder>
   using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
 
   using value_type = GenericWorkStruct<dispatcher_type>;
@@ -457,9 +460,9 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   using pointer = value_type*;
   using const_pointer = const value_type*;
 
-  // iterator base class for accessing stored WorkStructs outside of the container
-  struct const_iterator_base
-  {
+  // iterator base class for accessing stored WorkStructs outside of the
+  // container
+  struct const_iterator_base {
     using value_type = const typename WorkStorage::value_type;
     using pointer = typename WorkStorage::const_pointer;
     using reference = typename WorkStorage::const_reference;
@@ -467,14 +470,13 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
     using iterator_category = std::random_access_iterator_tag;
 
     const_iterator_base(const char* array_begin, const size_type* offset_iter)
-      : m_array_begin(array_begin)
-      , m_offset_iter(offset_iter)
-    { }
+        : m_array_begin(array_begin), m_offset_iter(offset_iter)
+    {
+    }
 
     RAJA_HOST_DEVICE reference operator*() const
     {
-      return *reinterpret_cast<pointer>(
-          m_array_begin + *m_offset_iter);
+      return *reinterpret_cast<pointer>(m_array_begin + *m_offset_iter);
     }
 
     RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n)
@@ -484,19 +486,22 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
     }
 
     RAJA_HOST_DEVICE friend inline difference_type operator-(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_offset_iter - rhs_iter.m_offset_iter;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator==(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_offset_iter == rhs_iter.m_offset_iter;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator<(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_offset_iter < rhs_iter.m_offset_iter;
     }
@@ -510,19 +515,19 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 
 
   explicit WorkStorage(allocator_type const& aloc)
-    : m_offsets(0, aloc)
-    , m_aloc(aloc)
-  { }
+      : m_offsets(0, aloc), m_aloc(aloc)
+  {
+  }
 
   WorkStorage(WorkStorage const&) = delete;
   WorkStorage& operator=(WorkStorage const&) = delete;
 
   WorkStorage(WorkStorage&& rhs)
-    : m_offsets(std::move(rhs.m_offsets))
-    , m_array_begin(rhs.m_array_begin)
-    , m_array_end(rhs.m_array_end)
-    , m_array_cap(rhs.m_array_cap)
-    , m_aloc(std::move(rhs.m_aloc))
+      : m_offsets(std::move(rhs.m_offsets)),
+        m_array_begin(rhs.m_array_begin),
+        m_array_end(rhs.m_array_end),
+        m_array_cap(rhs.m_array_cap),
+        m_aloc(std::move(rhs.m_aloc))
   {
     rhs.m_array_begin = nullptr;
     rhs.m_array_end = nullptr;
@@ -532,7 +537,8 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   WorkStorage& operator=(WorkStorage&& rhs)
   {
     if (this != &rhs) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment{});
     }
     return *this;
   }
@@ -546,10 +552,7 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   }
 
   // number of loops stored
-  size_type size() const
-  {
-    return m_offsets.size();
-  }
+  size_type size() const { return m_offsets.size(); }
 
   const_iterator begin() const
   {
@@ -562,17 +565,17 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   }
 
   // number of bytes used for storage of loops
-  size_type storage_size() const
-  {
-    return m_array_end - m_array_begin;
-  }
+  size_type storage_size() const { return m_array_end - m_array_begin; }
 
-  template < typename holder, typename ... holder_ctor_args >
-  void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template <typename holder, typename... holder_ctor_args>
+  void emplace(const dispatcher_type* dispatcher,
+               holder_ctor_args&&... ctor_args)
   {
     size_type value_offset = storage_size();
-    size_type value_size   = create_value<holder>(value_offset,
-        dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
+    size_type value_size =
+        create_value<holder>(value_offset,
+                             dispatcher,
+                             std::forward<holder_ctor_args>(ctor_args)...);
     m_offsets.emplace_back(value_offset);
     m_array_end += value_size;
   }
@@ -582,23 +585,24 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   {
     array_clear();
     if (m_array_begin != nullptr) {
-      allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+      allocator_traits_type::deallocate(m_aloc,
+                                        m_array_begin,
+                                        storage_capacity());
       m_array_begin = nullptr;
-      m_array_end   = nullptr;
-      m_array_cap   = nullptr;
+      m_array_end = nullptr;
+      m_array_cap = nullptr;
     }
   }
 
-  ~WorkStorage()
-  {
-    clear();
-  }
+  ~WorkStorage() { clear(); }
 
 private:
-  RAJAVec<size_type, typename allocator_traits_type::template rebind_alloc<size_type>> m_offsets;
+  RAJAVec<size_type,
+          typename allocator_traits_type::template rebind_alloc<size_type>>
+      m_offsets;
   char* m_array_begin = nullptr;
-  char* m_array_end   = nullptr;
-  char* m_array_cap   = nullptr;
+  char* m_array_end = nullptr;
+  char* m_array_cap = nullptr;
   allocator_type m_aloc;
 
   // move assignment if allocator propagates on move assignment
@@ -606,15 +610,15 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   {
     clear();
 
-    m_offsets     = std::move(rhs.m_offsets);
+    m_offsets = std::move(rhs.m_offsets);
     m_array_begin = rhs.m_array_begin;
-    m_array_end   = rhs.m_array_end  ;
-    m_array_cap   = rhs.m_array_cap  ;
-    m_aloc        = std::move(rhs.m_aloc);
+    m_array_end = rhs.m_array_end;
+    m_array_cap = rhs.m_array_cap;
+    m_aloc = std::move(rhs.m_aloc);
 
     rhs.m_array_begin = nullptr;
-    rhs.m_array_end   = nullptr;
-    rhs.m_array_cap   = nullptr;
+    rhs.m_array_end = nullptr;
+    rhs.m_array_cap = nullptr;
   }
 
   // move assignment if allocator does not propagate on move assignment
@@ -623,14 +627,14 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
     clear();
     if (m_aloc == rhs.m_aloc) {
 
-      m_offsets     = std::move(rhs.m_offsets);
+      m_offsets = std::move(rhs.m_offsets);
       m_array_begin = rhs.m_array_begin;
-      m_array_end   = rhs.m_array_end  ;
-      m_array_cap   = rhs.m_array_cap  ;
+      m_array_end = rhs.m_array_end;
+      m_array_cap = rhs.m_array_cap;
 
       rhs.m_array_begin = nullptr;
-      rhs.m_array_end   = nullptr;
-      rhs.m_array_cap   = nullptr;
+      rhs.m_array_end = nullptr;
+      rhs.m_array_cap = nullptr;
     } else {
       array_reserve(rhs.storage_size());
 
@@ -647,16 +651,10 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   }
 
   // get loop storage capacity, used and unused in bytes
-  size_type storage_capacity() const
-  {
-    return m_array_cap - m_array_begin;
-  }
+  size_type storage_capacity() const { return m_array_cap - m_array_begin; }
 
   // get unused loop storage capacity in bytes
-  size_type storage_unused() const
-  {
-    return m_array_cap - m_array_end;
-  }
+  size_type storage_unused() const { return m_array_cap - m_array_end; }
 
   // reserve space for loop_storage_size bytes of loop storage
   void array_reserve(size_type loop_storage_size)
@@ -665,21 +663,23 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 
       char* new_array_begin =
           allocator_traits_type::allocate(m_aloc, loop_storage_size);
-      char* new_array_end   = new_array_begin + storage_size();
-      char* new_array_cap   = new_array_begin + loop_storage_size;
+      char* new_array_end = new_array_begin + storage_size();
+      char* new_array_cap = new_array_begin + loop_storage_size;
 
       for (size_type i = 0; i < size(); ++i) {
         move_destroy_value(new_array_begin + m_offsets[i],
-                             m_array_begin + m_offsets[i]);
+                           m_array_begin + m_offsets[i]);
       }
 
       if (m_array_begin != nullptr) {
-        allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+        allocator_traits_type::deallocate(m_aloc,
+                                          m_array_begin,
+                                          storage_capacity());
       }
 
       m_array_begin = new_array_begin;
-      m_array_end   = new_array_end  ;
-      m_array_cap   = new_array_cap  ;
+      m_array_end = new_array_end;
+      m_array_cap = new_array_cap;
     }
   }
 
@@ -696,7 +696,7 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 
   // ensure there is enough storage to hold the next loop body at value offset
   // and store the loop body
-  template < typename holder, typename ... holder_ctor_args >
+  template <typename holder, typename... holder_ctor_args>
   size_type create_value(size_type value_offset,
                          const dispatcher_type* dispatcher,
                          holder_ctor_args&&... ctor_args)
@@ -704,7 +704,8 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
     const size_type value_size = sizeof(true_value_type<holder>);
 
     if (value_size > storage_unused()) {
-      array_reserve(std::max(storage_size() + value_size, 2*storage_capacity()));
+      array_reserve(
+          std::max(storage_size() + value_size, 2 * storage_capacity()));
     }
 
     pointer value_ptr = reinterpret_cast<pointer>(m_array_begin + value_offset);
@@ -726,13 +727,12 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   // destroy the loop body at value offset
   void destroy_value(size_type value_offset)
   {
-    pointer value_ptr =
-        reinterpret_cast<pointer>(m_array_begin + value_offset);
+    pointer value_ptr = reinterpret_cast<pointer>(m_array_begin + value_offset);
     value_type::destroy(value_ptr);
   }
 };
 
-template < typename ALLOCATOR_T, typename Dispatcher_T >
+template <typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage<RAJA::constant_stride_array_of_objects,
                   ALLOCATOR_T,
                   Dispatcher_T>
@@ -742,15 +742,17 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
-  static_assert(std::is_same<typename allocator_traits_type::value_type, char>::value,
+  static_assert(
+      std::is_same<typename allocator_traits_type::value_type, char>::value,
       "WorkStorage expects an allocator for 'char's.");
+
 public:
   using storage_policy = RAJA::constant_stride_array_of_objects;
   using dispatcher_type = Dispatcher_T;
 
-  template < typename holder >
+  template <typename holder>
   using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
 
   using value_type = GenericWorkStruct<dispatcher_type>;
@@ -762,9 +764,9 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   using pointer = value_type*;
   using const_pointer = const value_type*;
 
-  // iterator base class for accessing stored WorkStructs outside of the container
-  struct const_iterator_base
-  {
+  // iterator base class for accessing stored WorkStructs outside of the
+  // container
+  struct const_iterator_base {
     using value_type = const typename WorkStorage::value_type;
     using pointer = typename WorkStorage::const_pointer;
     using reference = typename WorkStorage::const_reference;
@@ -772,9 +774,9 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
     using iterator_category = std::random_access_iterator_tag;
 
     const_iterator_base(const char* array_pos, size_type stride)
-      : m_array_pos(array_pos)
-      , m_stride(stride)
-    { }
+        : m_array_pos(array_pos), m_stride(stride)
+    {
+    }
 
     RAJA_HOST_DEVICE reference operator*() const
     {
@@ -788,19 +790,22 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
     }
 
     RAJA_HOST_DEVICE friend inline difference_type operator-(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return (lhs_iter.m_array_pos - rhs_iter.m_array_pos) / lhs_iter.m_stride;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator==(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_array_pos == rhs_iter.m_array_pos;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator<(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_array_pos < rhs_iter.m_array_pos;
     }
@@ -813,30 +818,29 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   using const_iterator = random_access_iterator<const_iterator_base>;
 
 
-  explicit WorkStorage(allocator_type const& aloc)
-    : m_aloc(aloc)
-  { }
+  explicit WorkStorage(allocator_type const& aloc) : m_aloc(aloc) {}
 
   WorkStorage(WorkStorage const&) = delete;
   WorkStorage& operator=(WorkStorage const&) = delete;
 
   WorkStorage(WorkStorage&& rhs)
-    : m_aloc(std::move(rhs.m_aloc))
-    , m_stride(rhs.m_stride)
-    , m_array_begin(rhs.m_array_begin)
-    , m_array_end(rhs.m_array_end)
-    , m_array_cap(rhs.m_array_cap)
+      : m_aloc(std::move(rhs.m_aloc)),
+        m_stride(rhs.m_stride),
+        m_array_begin(rhs.m_array_begin),
+        m_array_end(rhs.m_array_end),
+        m_array_cap(rhs.m_array_cap)
   {
     // do not reset stride, leave it for reuse
     rhs.m_array_begin = nullptr;
-    rhs.m_array_end   = nullptr;
-    rhs.m_array_cap   = nullptr;
+    rhs.m_array_end = nullptr;
+    rhs.m_array_cap = nullptr;
   }
 
   WorkStorage& operator=(WorkStorage&& rhs)
   {
     if (this != &rhs) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment{});
     }
     return *this;
   }
@@ -847,35 +851,28 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   {
     size_type num_storage_loops =
         std::max(num_loops, (loop_storage_size + m_stride - 1) / m_stride);
-    array_reserve(num_storage_loops*m_stride, m_stride);
+    array_reserve(num_storage_loops * m_stride, m_stride);
   }
 
   // number of loops stored
-  size_type size() const
-  {
-    return storage_size() / m_stride;
-  }
+  size_type size() const { return storage_size() / m_stride; }
 
   const_iterator begin() const
   {
     return const_iterator(m_array_begin, m_stride);
   }
 
-  const_iterator end() const
-  {
-    return const_iterator(m_array_end, m_stride);
-  }
+  const_iterator end() const { return const_iterator(m_array_end, m_stride); }
 
   // amount of storage in bytes used to store loops
-  size_type storage_size() const
-  {
-    return m_array_end - m_array_begin;
-  }
+  size_type storage_size() const { return m_array_end - m_array_begin; }
 
-  template < typename holder, typename ... holder_ctor_args >
-  void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template <typename holder, typename... holder_ctor_args>
+  void emplace(const dispatcher_type* dispatcher,
+               holder_ctor_args&&... ctor_args)
   {
-    create_value<holder>(dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
+    create_value<holder>(dispatcher,
+                         std::forward<holder_ctor_args>(ctor_args)...);
     m_array_end += m_stride;
   }
 
@@ -884,40 +881,39 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   {
     array_clear();
     if (m_array_begin != nullptr) {
-      allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+      allocator_traits_type::deallocate(m_aloc,
+                                        m_array_begin,
+                                        storage_capacity());
       m_array_begin = nullptr;
-      m_array_end   = nullptr;
-      m_array_cap   = nullptr;
+      m_array_end = nullptr;
+      m_array_cap = nullptr;
     }
   }
 
-  ~WorkStorage()
-  {
-    clear();
-  }
+  ~WorkStorage() { clear(); }
 
 private:
   allocator_type m_aloc;
-  size_type m_stride     = 1; // can't be 0 because size divides stride
+  size_type m_stride = 1;  // can't be 0 because size() divides by stride
   char* m_array_begin = nullptr;
-  char* m_array_end   = nullptr;
-  char* m_array_cap   = nullptr;
+  char* m_array_end = nullptr;
+  char* m_array_cap = nullptr;
 
   // move assignment if allocator propagates on move assignment
   void move_assign_private(WorkStorage&& rhs, std::true_type)
   {
     clear();
 
-    m_aloc        = std::move(rhs.m_aloc);
-    m_stride      = rhs.m_stride     ;
+    m_aloc = std::move(rhs.m_aloc);
+    m_stride = rhs.m_stride;
     m_array_begin = rhs.m_array_begin;
-    m_array_end   = rhs.m_array_end  ;
-    m_array_cap   = rhs.m_array_cap  ;
+    m_array_end = rhs.m_array_end;
+    m_array_cap = rhs.m_array_cap;
 
     // do not reset stride, leave it for reuse
     rhs.m_array_begin = nullptr;
-    rhs.m_array_end   = nullptr;
-    rhs.m_array_cap   = nullptr;
+    rhs.m_array_end = nullptr;
+    rhs.m_array_cap = nullptr;
   }
 
   // move assignment if allocator does not propagate on move assignment
@@ -926,15 +922,15 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
     clear();
     if (m_aloc == rhs.m_aloc) {
 
-      m_stride      = rhs.m_stride     ;
+      m_stride = rhs.m_stride;
       m_array_begin = rhs.m_array_begin;
-      m_array_end   = rhs.m_array_end  ;
-      m_array_cap   = rhs.m_array_cap  ;
+      m_array_end = rhs.m_array_end;
+      m_array_cap = rhs.m_array_cap;
 
       // do not reset stride, leave it for reuse
       rhs.m_array_begin = nullptr;
-      rhs.m_array_end   = nullptr;
-      rhs.m_array_cap   = nullptr;
+      rhs.m_array_end = nullptr;
+      rhs.m_array_cap = nullptr;
     } else {
 
       m_stride = rhs.m_stride;
@@ -950,16 +946,10 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   }
 
   // storage capacity, used and unused, in bytes
-  size_type storage_capacity() const
-  {
-    return m_array_cap - m_array_begin;
-  }
+  size_type storage_capacity() const { return m_array_cap - m_array_begin; }
 
   // unused storage capacity in bytes
-  size_type storage_unused() const
-  {
-    return m_array_cap - m_array_end;
-  }
+  size_type storage_unused() const { return m_array_cap - m_array_end; }
 
   // allocate enough storage for loop_storage_size bytes with
   // each loop body separated by new_stride bytes
@@ -972,29 +962,32 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
       char* new_array_begin =
           allocator_traits_type::allocate(m_aloc, loop_storage_size);
-      char* new_array_end   = new_array_begin + size() * new_stride;
-      char* new_array_cap   = new_array_begin + loop_storage_size;
+      char* new_array_end = new_array_begin + size() * new_stride;
+      char* new_array_cap = new_array_begin + loop_storage_size;
 
       for (size_type i = 0; i < size(); ++i) {
         move_destroy_value(new_array_begin + i * new_stride,
-                             m_array_begin + i *   m_stride);
+                           m_array_begin + i * m_stride);
       }
 
       if (m_array_begin != nullptr) {
-        allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+        allocator_traits_type::deallocate(m_aloc,
+                                          m_array_begin,
+                                          storage_capacity());
       }
 
-      m_stride      = new_stride     ;
+      m_stride = new_stride;
       m_array_begin = new_array_begin;
-      m_array_end   = new_array_end  ;
-      m_array_cap   = new_array_cap  ;
+      m_array_end = new_array_end;
+      m_array_cap = new_array_cap;
     }
   }
 
   // destroy the loops in storage (does not deallocate loop storage)
   void array_clear()
   {
-    for (size_type value_offset = storage_size(); value_offset > 0; value_offset -= m_stride) {
+    for (size_type value_offset = storage_size(); value_offset > 0;
+         value_offset -= m_stride) {
       destroy_value(value_offset - m_stride);
       m_array_end -= m_stride;
     }
@@ -1002,18 +995,17 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
   // ensure there is enough storage to store the loop body
   // and construct the body in storage.
-  template < typename holder, typename ... holder_ctor_args >
+  template <typename holder, typename... holder_ctor_args>
   void create_value(const dispatcher_type* dispatcher,
                     holder_ctor_args&&... ctor_args)
   {
     const size_type value_size = sizeof(true_value_type<holder>);
 
     if (value_size > storage_unused() && value_size <= m_stride) {
-      array_reserve(std::max(storage_size() + m_stride, 2*storage_capacity()),
+      array_reserve(std::max(storage_size() + m_stride, 2 * storage_capacity()),
                     m_stride);
     } else if (value_size > m_stride) {
-      array_reserve((size()+1)*value_size,
-                    value_size);
+      array_reserve((size() + 1) * value_size, value_size);
     }
 
     size_type value_offset = storage_size();
@@ -1025,8 +1017,7 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
   // move construct the loop body in value from other and
   // destroy the loop body in other
-  void move_destroy_value(char* value_ptr,
-                          char* other_value_ptr)
+  void move_destroy_value(char* value_ptr, char* other_value_ptr)
   {
     value_type::move_destroy(reinterpret_cast<pointer>(value_ptr),
                              reinterpret_cast<pointer>(other_value_ptr));
@@ -1035,8 +1026,7 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   // destroy the loop body at value offset
   void destroy_value(size_type value_offset)
   {
-    pointer value_ptr =
-        reinterpret_cast<pointer>(m_array_begin + value_offset);
+    pointer value_ptr = reinterpret_cast<pointer>(m_array_begin + value_offset);
     value_type::destroy(value_ptr);
   }
 };
diff --git a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp
index 72e1540c54..39a35c5176 100644
--- a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp
+++ b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp
@@ -18,11 +18,10 @@
 #ifndef RAJA_PATTERN_WORKGROUP_WorkStruct_HPP
 #define RAJA_PATTERN_WORKGROUP_WorkStruct_HPP
 
-#include "RAJA/config.hpp"
-
-#include <utility>
 #include <cstddef>
+#include <utility>
 
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
 
 
@@ -35,7 +34,7 @@ namespace detail
 /*!
  * A struct that gives a generic way to layout memory for different loops
  */
-template < size_t size, typename Dispatcher_T >
+template <size_t size, typename Dispatcher_T>
 struct WorkStruct;
 
 /*!
@@ -44,67 +43,74 @@ struct WorkStruct;
  *   offsetof(GenericWorkStruct<>, obj) == offsetof(WorkStruct<size>, obj)
  *   sizeof(GenericWorkStruct) <= sizeof(WorkStruct<size>)
  */
-template < typename Dispatcher_T >
+template <typename Dispatcher_T>
 using GenericWorkStruct = WorkStruct<RAJA_MAX_ALIGN, Dispatcher_T>;
 
-template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs >
-struct WorkStruct<size, Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>>
-{
-  using dispatcher_type = Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>;
+template <size_t size,
+          Platform platform,
+          typename dispatch_policy,
+          typename DispatcherID,
+          typename... CallArgs>
+struct WorkStruct<
+    size,
+    Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>> {
+  using dispatcher_type =
+      Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>;
 
   // construct a WorkStruct with a value of type holder from the args and
   // check a variety of constraints at compile time
-  template < typename holder, typename ... holder_ctor_args >
-  static RAJA_INLINE
-  void construct(void* ptr, const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template <typename holder, typename... holder_ctor_args>
+  static RAJA_INLINE void construct(void* ptr,
+                                    const dispatcher_type* dispatcher,
+                                    holder_ctor_args&&... ctor_args)
   {
     using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
     using value_type = GenericWorkStruct<dispatcher_type>;
 
     static_assert(sizeof(holder) <= sizeof(true_value_type::obj),
-        "holder must fit in WorkStruct::obj");
+                  "holder must fit in WorkStruct::obj");
     static_assert(std::is_standard_layout<true_value_type>::value,
-        "WorkStruct must be a standard layout type");
+                  "WorkStruct must be a standard layout type");
     static_assert(std::is_standard_layout<value_type>::value,
-        "GenericWorkStruct must be a standard layout type");
+                  "GenericWorkStruct must be a standard layout type");
     static_assert(offsetof(value_type, obj) == offsetof(true_value_type, obj),
-        "WorkStruct and GenericWorkStruct must have obj at the same offset");
+                  "WorkStruct and GenericWorkStruct must have obj at the same "
+                  "offset");
     static_assert(sizeof(value_type) <= sizeof(true_value_type),
-        "WorkStruct must not be smaller than GenericWorkStruct");
+                  "WorkStruct must not be smaller than GenericWorkStruct");
     true_value_type* value_ptr = static_cast<true_value_type*>(ptr);
 
     value_ptr->dispatcher = dispatcher;
     value_ptr->invoke = dispatcher->invoke;
-    new(&value_ptr->obj) holder(std::forward<holder_ctor_args>(ctor_args)...);
+    new (&value_ptr->obj) holder(std::forward<holder_ctor_args>(ctor_args)...);
   }
 
   // move construct in dst from the value in src and destroy the value in src
-  static RAJA_INLINE
-  void move_destroy(WorkStruct* value_dst,
-                    WorkStruct* value_src)
+  static RAJA_INLINE void move_destroy(WorkStruct* value_dst,
+                                       WorkStruct* value_src)
   {
     value_dst->dispatcher = value_src->dispatcher;
     value_dst->invoke = value_src->invoke;
-    value_dst->dispatcher->move_construct_destroy(&value_dst->obj, &value_src->obj);
+    value_dst->dispatcher->move_construct_destroy(&value_dst->obj,
+                                                  &value_src->obj);
   }
 
   // destroy the value ptr
-  static RAJA_INLINE
-  void destroy(WorkStruct* value_ptr)
+  static RAJA_INLINE void destroy(WorkStruct* value_ptr)
   {
     value_ptr->dispatcher->destroy(&value_ptr->obj);
   }
 
   // invoke the call operator of the value ptr with args
-  static RAJA_INLINE
-  void host_call(const WorkStruct* value_ptr, CallArgs... args)
+  static RAJA_INLINE void host_call(const WorkStruct* value_ptr,
+                                    CallArgs... args)
   {
     value_ptr->invoke(&value_ptr->obj, std::forward<CallArgs>(args)...);
   }
   ///
   // invoke the call operator of the value ptr with args
-  static RAJA_DEVICE RAJA_INLINE
-  void device_call(const WorkStruct* value_ptr, CallArgs... args)
+  static RAJA_DEVICE RAJA_INLINE void device_call(const WorkStruct* value_ptr,
+                                                  CallArgs... args)
   {
     value_ptr->invoke(&value_ptr->obj, std::forward<CallArgs>(args)...);
   }
diff --git a/include/RAJA/pattern/atomic.hpp b/include/RAJA/pattern/atomic.hpp
index d5905f7928..e0fbe00451 100644
--- a/include/RAJA/pattern/atomic.hpp
+++ b/include/RAJA/pattern/atomic.hpp
@@ -19,10 +19,8 @@
 #define RAJA_pattern_atomic_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/atomic_auto.hpp"
 #include "RAJA/policy/atomic_builtin.hpp"
-
 #include "RAJA/util/macros.hpp"
 
 namespace RAJA
@@ -317,22 +315,19 @@ class AtomicRef
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr explicit AtomicRef(value_type *value_ptr)
-      : m_value_ptr(value_ptr) {}
+  constexpr explicit AtomicRef(value_type *value_ptr) : m_value_ptr(value_ptr)
+  {
+  }
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr AtomicRef(AtomicRef const &c)
-      : m_value_ptr(c.m_value_ptr) {}
+  constexpr AtomicRef(AtomicRef const &c) : m_value_ptr(c.m_value_ptr) {}
 
-  AtomicRef& operator=(AtomicRef const&) = delete;
+  AtomicRef &operator=(AtomicRef const &) = delete;
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  value_type * getPointer() const
-  {
-    return m_value_ptr;
-  }
+  value_type *getPointer() const { return m_value_ptr; }
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
@@ -351,17 +346,11 @@ class AtomicRef
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  value_type load() const
-  {
-    return RAJA::atomicLoad<Policy>(m_value_ptr);
-  }
+  value_type load() const { return RAJA::atomicLoad<Policy>(m_value_ptr); }
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  operator value_type() const
-  {
-    return RAJA::atomicLoad<Policy>(m_value_ptr);
-  }
+  operator value_type() const { return RAJA::atomicLoad<Policy>(m_value_ptr); }
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
@@ -379,7 +368,7 @@ class AtomicRef
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  bool compare_exchange_strong(value_type& expect, value_type rhs) const
+  bool compare_exchange_strong(value_type &expect, value_type rhs) const
   {
     value_type compare = expect;
     value_type old = RAJA::atomicCAS<Policy>(m_value_ptr, compare, rhs);
@@ -393,7 +382,7 @@ class AtomicRef
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  bool compare_exchange_weak(value_type& expect, value_type rhs) const
+  bool compare_exchange_weak(value_type &expect, value_type rhs) const
   {
     return this->compare_exchange_strong(expect, rhs);
   }
diff --git a/include/RAJA/pattern/detail/algorithm.hpp b/include/RAJA/pattern/detail/algorithm.hpp
index 21d266bd21..ff2fe33f00 100644
--- a/include/RAJA/pattern/detail/algorithm.hpp
+++ b/include/RAJA/pattern/detail/algorithm.hpp
@@ -20,12 +20,12 @@
 #ifndef RAJA_pattern_detail_algorithm_HPP
 #define RAJA_pattern_detail_algorithm_HPP
 
+#include <iterator>
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/macros.hpp"
 #include "camp/helpers.hpp"
 
-#include <iterator>
-
 namespace RAJA
 {
 
@@ -49,16 +49,17 @@ using ContainerVal =
     camp::decay<decltype(*camp::val<camp::iterator_from<Container>>())>;
 
 template <typename Container>
-using ContainerRef =
-    decltype(*camp::val<camp::iterator_from<Container>>());
+using ContainerRef = decltype(*camp::val<camp::iterator_from<Container>>());
 
 template <typename Container>
 using ContainerDiff =
-    camp::decay<decltype(camp::val<camp::iterator_from<Container>>()-camp::val<camp::iterator_from<Container>>())>;
+    camp::decay<decltype(camp::val<camp::iterator_from<Container>>() -
+                         camp::val<camp::iterator_from<Container>>())>;
 
 template <typename DiffType, typename CountType>
-RAJA_INLINE
-DiffType firstIndex(DiffType n, CountType num_threads, CountType thread_id)
+RAJA_INLINE DiffType firstIndex(DiffType n,
+                                CountType num_threads,
+                                CountType thread_id)
 {
   return (static_cast<size_t>(n) * thread_id) / num_threads;
 }
@@ -70,9 +71,7 @@ DiffType firstIndex(DiffType n, CountType num_threads, CountType thread_id)
     \brief swap values at iterators lhs and rhs
 */
 template <typename Iter>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-safe_iter_swap(Iter lhs, Iter rhs)
+RAJA_HOST_DEVICE RAJA_INLINE void safe_iter_swap(Iter lhs, Iter rhs)
 {
 #ifdef RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE
   using camp::safe_swap;
@@ -87,9 +86,7 @@ safe_iter_swap(Iter lhs, Iter rhs)
     \brief returns iterator to next item
 */
 template <typename Iter>
-RAJA_HOST_DEVICE RAJA_INLINE
-Iter
-next(Iter it)
+RAJA_HOST_DEVICE RAJA_INLINE Iter next(Iter it)
 {
   ++it;
   return it;
@@ -99,9 +96,7 @@ next(Iter it)
     \brief returns iterator to next item
 */
 template <typename Iter>
-RAJA_HOST_DEVICE RAJA_INLINE
-Iter
-prev(Iter it)
+RAJA_HOST_DEVICE RAJA_INLINE Iter prev(Iter it)
 {
   --it;
   return it;
diff --git a/include/RAJA/pattern/detail/multi_reduce.hpp b/include/RAJA/pattern/detail/multi_reduce.hpp
index 884b9aa989..1c8edcf3f2 100644
--- a/include/RAJA/pattern/detail/multi_reduce.hpp
+++ b/include/RAJA/pattern/detail/multi_reduce.hpp
@@ -19,39 +19,34 @@
 #define RAJA_PATTERN_DETAIL_MULTI_REDUCE_HPP
 
 #include "RAJA/pattern/detail/forall.hpp"
-
-#include "RAJA/util/macros.hpp"
 #include "RAJA/util/Operators.hpp"
-#include "RAJA/util/types.hpp"
 #include "RAJA/util/RepeatView.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 
 
-#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA)    \
-  template <typename tuning, typename T>                      \
-  struct MultiReduce##OP_NAME<POL<tuning>, T>                 \
-      : reduce::detail::BaseMultiReduce##OP_NAME<             \
-            DATA<T, RAJA::reduce::OP<T>, tuning>>             \
-  {                                                           \
-    using policy = POL<tuning>;                               \
-    using Base = reduce::detail::BaseMultiReduce##OP_NAME<    \
-        DATA<T, RAJA::reduce::OP<T>, tuning>>;                \
-    using Base::Base;                                         \
-    using typename Base::value_type;                          \
-    using typename Base::reference;                           \
-                                                              \
-    RAJA_SUPPRESS_HD_WARN                                     \
-    RAJA_HOST_DEVICE                                          \
-    reference operator[](size_t bin) const                    \
-    {                                                         \
-      return reference(*this, bin);                           \
-    }                                                         \
+#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA)                   \
+  template <typename tuning, typename T>                                     \
+  struct MultiReduce##OP_NAME<POL<tuning>, T>                                \
+      : reduce::detail::BaseMultiReduce##OP_NAME<                            \
+            DATA<T, RAJA::reduce::OP<T>, tuning>> {                          \
+    using policy = POL<tuning>;                                              \
+    using Base = reduce::detail::BaseMultiReduce##OP_NAME<                   \
+        DATA<T, RAJA::reduce::OP<T>, tuning>>;                               \
+    using Base::Base;                                                        \
+    using typename Base::value_type;                                         \
+    using typename Base::reference;                                          \
+                                                                             \
+    RAJA_SUPPRESS_HD_WARN                                                    \
+    RAJA_HOST_DEVICE                                                         \
+    reference operator[](size_t bin) const { return reference(*this, bin); } \
   };
 
-#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA)            \
-  RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA)             \
-  RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA)             \
-  RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA)             \
-  RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA)        \
+#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA)     \
+  RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA)      \
+  RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA)      \
+  RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA)      \
+  RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA) \
   RAJA_DECLARE_MULTI_REDUCER(BitAnd, and_bit, POL, DATA)
 
 namespace RAJA
@@ -64,34 +59,40 @@ namespace detail
 {
 
 template <typename t_MultiReduceData>
-struct BaseMultiReduce
-{
+struct BaseMultiReduce {
   using MultiReduceData = t_MultiReduceData;
   using MultiReduceOp = typename t_MultiReduceData::MultiReduceOp;
   using value_type = typename t_MultiReduceData::value_type;
 
-  BaseMultiReduce() : BaseMultiReduce{RepeatView<value_type>(MultiReduceOp::identity(), 0)} {}
+  BaseMultiReduce()
+      : BaseMultiReduce{RepeatView<value_type>(MultiReduceOp::identity(), 0)}
+  {
+  }
 
   explicit BaseMultiReduce(size_t num_bins,
                            value_type init_val = MultiReduceOp::identity(),
                            value_type identity = MultiReduceOp::identity())
       : BaseMultiReduce{RepeatView<value_type>(init_val, num_bins), identity}
-  { }
+  {
+  }
 
-  template < typename Container,
-             concepts::enable_if_t<type_traits::is_range<Container>,
-                                   concepts::negate<std::is_convertible<Container, size_t>>,
-                                   concepts::negate<std::is_base_of<BaseMultiReduce, Container>>>* = nullptr >
-  explicit BaseMultiReduce(Container const& container,
+  template <typename Container,
+            concepts::enable_if_t<
+                type_traits::is_range<Container>,
+                concepts::negate<std::is_convertible<Container, size_t>>,
+                concepts::negate<std::is_base_of<BaseMultiReduce, Container>>>
+                * = nullptr>
+  explicit BaseMultiReduce(Container const &container,
                            value_type identity = MultiReduceOp::identity())
       : data{container, identity}
-  { }
+  {
+  }
 
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduce(BaseMultiReduce const&) = default;
+  BaseMultiReduce(BaseMultiReduce const &) = default;
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduce(BaseMultiReduce &&) = default;
-  BaseMultiReduce &operator=(BaseMultiReduce const&) = delete;
+  BaseMultiReduce &operator=(BaseMultiReduce const &) = delete;
   BaseMultiReduce &operator=(BaseMultiReduce &&) = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduce() = default;
@@ -108,13 +109,13 @@ struct BaseMultiReduce
     reset(RepeatView<value_type>(init_val, num_bins), identity);
   }
 
-  template < typename Container,
-             concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr >
-  void reset(Container const& container,
+  template <typename Container,
+            concepts::enable_if_t<type_traits::is_range<Container>> * = nullptr>
+  void reset(Container const &container,
              value_type identity = MultiReduceOp::identity())
   {
     for (size_t bin = 0; bin < data.num_bins(); ++bin) {
-      RAJA_UNUSED_VAR(get(bin)); // automatic get() before reset
+      RAJA_UNUSED_VAR(get(bin));  // automatic get() before reset
     }
     data.reset(container, identity);
   }
@@ -125,7 +126,7 @@ struct BaseMultiReduce
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  BaseMultiReduce const& combine(size_t bin, value_type const &other) const
+  BaseMultiReduce const &combine(size_t bin, value_type const &other) const
   {
     data.combine(bin, other);
     return *this;
@@ -135,16 +136,18 @@ struct BaseMultiReduce
   value_type get(size_t bin) const { return data.get(bin); }
 
   //! Get the calculated reduced value for each bin and store it in container
-  template < typename Container,
-             concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr >
-  void get_all(Container& container) const
+  template <typename Container,
+            concepts::enable_if_t<type_traits::is_range<Container>> * = nullptr>
+  void get_all(Container &container) const
   {
     RAJA_EXTRACT_BED_IT(container);
     if (size_t(distance_it) != data.num_bins()) {
-      RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size than multi reducer");
+      RAJA_ABORT_OR_THROW(
+          "MultiReduce::get_all container has different size than multi "
+          "reducer");
     }
     size_t bin = 0;
-    for (auto& val : container) {
+    for (auto &val : container) {
       val = data.get(bin);
       ++bin;
     }
@@ -167,42 +170,39 @@ class BaseMultiReduceMin : public BaseMultiReduce<MultiReduceData>
 {
 public:
   using Base = BaseMultiReduce<MultiReduceData>;
-  using typename Base::value_type;
   using Base::Base;
+  using typename Base::value_type;
 
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMin(BaseMultiReduceMin const&) = default;
+  BaseMultiReduceMin(BaseMultiReduceMin const &) = default;
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceMin(BaseMultiReduceMin &&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMin &operator=(BaseMultiReduceMin const&) = delete;
+  BaseMultiReduceMin &operator=(BaseMultiReduceMin const &) = delete;
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceMin &operator=(BaseMultiReduceMin &&) = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceMin() = default;
 
-  struct reference
-  {
+  struct reference {
     RAJA_HOST_DEVICE
-    reference(BaseMultiReduceMin const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+    reference(BaseMultiReduceMin const &base, size_t bin)
+        : m_base(base), m_bin(bin)
+    {
+    }
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
-    reference const& min(value_type rhs) const
+    reference const &min(value_type rhs) const
     {
       m_base.combine(m_bin, rhs);
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
-    BaseMultiReduceMin const& m_base;
+    BaseMultiReduceMin const &m_base;
     size_t m_bin;
   };
 };
@@ -224,36 +224,33 @@ class BaseMultiReduceMax : public BaseMultiReduce<MultiReduceData>
   using Base::Base;
 
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMax(BaseMultiReduceMax const&) = default;
+  BaseMultiReduceMax(BaseMultiReduceMax const &) = default;
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceMax(BaseMultiReduceMax &&) = default;
-  BaseMultiReduceMax &operator=(BaseMultiReduceMax const&) = delete;
+  BaseMultiReduceMax &operator=(BaseMultiReduceMax const &) = delete;
   BaseMultiReduceMax &operator=(BaseMultiReduceMax &&) = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceMax() = default;
 
-  struct reference
-  {
+  struct reference {
     RAJA_HOST_DEVICE
-    reference(BaseMultiReduceMax const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+    reference(BaseMultiReduceMax const &base, size_t bin)
+        : m_base(base), m_bin(bin)
+    {
+    }
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
-    reference const& max(value_type rhs) const
+    reference const &max(value_type rhs) const
     {
       m_base.combine(m_bin, rhs);
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
-    BaseMultiReduceMax const& m_base;
+    BaseMultiReduceMax const &m_base;
     size_t m_bin;
   };
 };
@@ -275,36 +272,33 @@ class BaseMultiReduceSum : public BaseMultiReduce<MultiReduceData>
   using Base::Base;
 
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceSum(BaseMultiReduceSum const&) = default;
+  BaseMultiReduceSum(BaseMultiReduceSum const &) = default;
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceSum(BaseMultiReduceSum &&) = default;
-  BaseMultiReduceSum &operator=(BaseMultiReduceSum const&) = delete;
+  BaseMultiReduceSum &operator=(BaseMultiReduceSum const &) = delete;
   BaseMultiReduceSum &operator=(BaseMultiReduceSum &&) = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceSum() = default;
 
-  struct reference
-  {
+  struct reference {
     RAJA_HOST_DEVICE
-    reference(BaseMultiReduceSum const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+    reference(BaseMultiReduceSum const &base, size_t bin)
+        : m_base(base), m_bin(bin)
+    {
+    }
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
-    reference const& operator+=(value_type rhs) const
+    reference const &operator+=(value_type rhs) const
     {
       m_base.combine(m_bin, rhs);
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
-    BaseMultiReduceSum const& m_base;
+    BaseMultiReduceSum const &m_base;
     size_t m_bin;
   };
 };
@@ -326,36 +320,33 @@ class BaseMultiReduceBitOr : public BaseMultiReduce<MultiReduceData>
   using Base::Base;
 
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceBitOr(BaseMultiReduceBitOr const&) = default;
+  BaseMultiReduceBitOr(BaseMultiReduceBitOr const &) = default;
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceBitOr(BaseMultiReduceBitOr &&) = default;
-  BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr const&) = delete;
+  BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr const &) = delete;
   BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr &&) = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceBitOr() = default;
 
-  struct reference
-  {
+  struct reference {
     RAJA_HOST_DEVICE
-    reference(BaseMultiReduceBitOr const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+    reference(BaseMultiReduceBitOr const &base, size_t bin)
+        : m_base(base), m_bin(bin)
+    {
+    }
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
-    reference const& operator|=(value_type rhs) const
+    reference const &operator|=(value_type rhs) const
     {
       m_base.combine(m_bin, rhs);
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
-    BaseMultiReduceBitOr const& m_base;
+    BaseMultiReduceBitOr const &m_base;
     size_t m_bin;
   };
 };
@@ -377,36 +368,33 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce<MultiReduceData>
   using Base::Base;
 
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceBitAnd(BaseMultiReduceBitAnd const&) = default;
+  BaseMultiReduceBitAnd(BaseMultiReduceBitAnd const &) = default;
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceBitAnd(BaseMultiReduceBitAnd &&) = default;
-  BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd const&) = delete;
+  BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd const &) = delete;
   BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd &&) = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceBitAnd() = default;
 
-  struct reference
-  {
+  struct reference {
     RAJA_HOST_DEVICE
-    reference(BaseMultiReduceBitAnd const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+    reference(BaseMultiReduceBitAnd const &base, size_t bin)
+        : m_base(base), m_bin(bin)
+    {
+    }
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
-    reference const& operator&=(value_type rhs) const
+    reference const &operator&=(value_type rhs) const
     {
       m_base.combine(m_bin, rhs);
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
-    BaseMultiReduceBitAnd const& m_base;
+    BaseMultiReduceBitAnd const &m_base;
     size_t m_bin;
   };
 };
diff --git a/include/RAJA/pattern/detail/reduce.hpp b/include/RAJA/pattern/detail/reduce.hpp
index 788f3c698d..0fa1369763 100644
--- a/include/RAJA/pattern/detail/reduce.hpp
+++ b/include/RAJA/pattern/detail/reduce.hpp
@@ -41,13 +41,13 @@
     using Base::Base;                                                    \
   };
 
-#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER)       \
-  RAJA_DECLARE_REDUCER(Sum, POL, COMBINER)             \
-  RAJA_DECLARE_REDUCER(Min, POL, COMBINER)             \
-  RAJA_DECLARE_REDUCER(Max, POL, COMBINER)             \
-  RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER)    \
-  RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER)    \
-  RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER)           \
+#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER)    \
+  RAJA_DECLARE_REDUCER(Sum, POL, COMBINER)          \
+  RAJA_DECLARE_REDUCER(Min, POL, COMBINER)          \
+  RAJA_DECLARE_REDUCER(Max, POL, COMBINER)          \
+  RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER) \
+  RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER) \
+  RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER)        \
   RAJA_DECLARE_REDUCER(BitAnd, POL, COMBINER)
 
 namespace RAJA
@@ -107,7 +107,8 @@ namespace detail
 {
 
 template <typename T, bool = std::is_integral<T>::value>
-struct DefaultLoc {};
+struct DefaultLoc {
+};
 
 template <typename T>
 struct DefaultLoc<T, false>  // any non-integral type
@@ -116,8 +117,7 @@ struct DefaultLoc<T, false>  // any non-integral type
 };
 
 template <typename T>
-struct DefaultLoc<T, true>
-{
+struct DefaultLoc<T, true> {
   RAJA_HOST_DEVICE constexpr T value() const { return -1; }
 };
 
@@ -128,18 +128,30 @@ class ValueLoc
   T val = doing_min ? operators::limits<T>::max() : operators::limits<T>::min();
   IndexType loc = DefaultLoc<IndexType>().value();
 
-#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 || defined(__HIPCC__)
+#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 || \
+    defined(__HIPCC__)
   RAJA_HOST_DEVICE constexpr ValueLoc() {}
-  RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const &other) : val{other.val}, loc{other.loc} {}
+  RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const &other)
+      : val{other.val}, loc{other.loc}
+  {
+  }
   RAJA_HOST_DEVICE
-  ValueLoc &operator=(ValueLoc const &other) { val = other.val; loc = other.loc; return *this;}
+  ValueLoc &operator=(ValueLoc const &other)
+  {
+    val = other.val;
+    loc = other.loc;
+    return *this;
+  }
 #else
   constexpr ValueLoc() = default;
   constexpr ValueLoc(ValueLoc const &) = default;
   ValueLoc &operator=(ValueLoc const &) = default;
 #endif
 
-  RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_) : val{val_}, loc{DefaultLoc<IndexType>().value()} {}
+  RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_)
+      : val{val_}, loc{DefaultLoc<IndexType>().value()}
+  {
+  }
   RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_, IndexType const &loc_)
       : val{val_}, loc{loc_}
   {
@@ -165,13 +177,15 @@ namespace operators
 {
 template <typename T, typename IndexType, bool B>
 struct limits<::RAJA::reduce::detail::ValueLoc<T, IndexType, B>> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr
-  ::RAJA::reduce::detail::ValueLoc<T, IndexType, B> min()
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail::
+      ValueLoc<T, IndexType, B>
+      min()
   {
     return ::RAJA::reduce::detail::ValueLoc<T, IndexType, B>(limits<T>::min());
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr
-  ::RAJA::reduce::detail::ValueLoc<T, IndexType, B> max()
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail::
+      ValueLoc<T, IndexType, B>
+      max()
   {
     return ::RAJA::reduce::detail::ValueLoc<T, IndexType, B>(limits<T>::max());
   }
@@ -215,7 +229,7 @@ class BaseReduce
   RAJA_HOST_DEVICE
   void reset(T val, T identity_ = Reduce::identity())
   {
-    operator T(); // automatic get() before reset
+    operator T();  // automatic get() before reset
     c.reset(val, identity_);
   }
 
@@ -350,7 +364,10 @@ class BaseReduceMin : public BaseReduce<T, RAJA::reduce::min, Combiner>
  *
  **************************************************************************
  */
-template <typename T, typename IndexType, template <typename, typename> class Combiner>
+template <typename T,
+          typename IndexType,
+          template <typename, typename>
+          class Combiner>
 class BaseReduceMinLoc
     : public BaseReduce<ValueLoc<T, IndexType>, RAJA::reduce::min, Combiner>
 {
@@ -362,19 +379,24 @@ class BaseReduceMinLoc
 
   constexpr BaseReduceMinLoc() : Base(value_type(T(), IndexType())) {}
 
-  constexpr BaseReduceMinLoc(T init_val, IndexType init_idx,
-                             T identity_val_ = reduce_type::identity(),
-                             IndexType identity_loc_ = DefaultLoc<IndexType>().value())
-    : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_))
+  constexpr BaseReduceMinLoc(
+      T init_val,
+      IndexType init_idx,
+      T identity_val_ = reduce_type::identity(),
+      IndexType identity_loc_ = DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val_, identity_loc_))
   {
   }
 
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val_ = reduce_type::identity(),
              IndexType identity_loc_ = DefaultLoc<IndexType>().value())
   {
-    operator T(); // automatic get() before reset
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_));
+    operator T();  // automatic get() before reset
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val_, identity_loc_));
   }
 
   /// \brief reducer function; updates the current instance's state
@@ -495,31 +517,41 @@ class BaseReduceBitAnd : public BaseReduce<T, RAJA::reduce::and_bit, Combiner>
  *
  **************************************************************************
  */
-template <typename T, typename IndexType, template <typename, typename> class Combiner>
-class BaseReduceMaxLoc
-    : public BaseReduce<ValueLoc<T, IndexType, false>, RAJA::reduce::max, Combiner>
+template <typename T,
+          typename IndexType,
+          template <typename, typename>
+          class Combiner>
+class BaseReduceMaxLoc : public BaseReduce<ValueLoc<T, IndexType, false>,
+                                           RAJA::reduce::max,
+                                           Combiner>
 {
 public:
-  using Base = BaseReduce<ValueLoc<T, IndexType, false>, RAJA::reduce::max, Combiner>;
+  using Base =
+      BaseReduce<ValueLoc<T, IndexType, false>, RAJA::reduce::max, Combiner>;
   using value_type = typename Base::value_type;
   using reduce_type = typename Base::reduce_type;
   using Base::Base;
 
   constexpr BaseReduceMaxLoc() : Base(value_type(T(), IndexType())) {}
 
-  constexpr BaseReduceMaxLoc(T init_val, IndexType init_idx,
-                             T identity_val_ = reduce_type::identity(),
-                             IndexType identity_loc_ = DefaultLoc<IndexType>().value())
-    : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_))
+  constexpr BaseReduceMaxLoc(
+      T init_val,
+      IndexType init_idx,
+      T identity_val_ = reduce_type::identity(),
+      IndexType identity_loc_ = DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val_, identity_loc_))
   {
   }
 
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val_ = reduce_type::identity(),
              IndexType identity_loc_ = DefaultLoc<IndexType>().value())
   {
-    operator T(); // automatic get() before reset
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_));
+    operator T();  // automatic get() before reset
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val_, identity_loc_));
   }
 
   //! reducer function; updates the current instance's state
diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp
index e75cc43af7..bcd8afd8c6 100644
--- a/include/RAJA/pattern/forall.hpp
+++ b/include/RAJA/pattern/forall.hpp
@@ -52,36 +52,27 @@
 #ifndef RAJA_forall_generic_HPP
 #define RAJA_forall_generic_HPP
 
-#include "RAJA/config.hpp"
-
 #include <functional>
 #include <iterator>
 #include <type_traits>
 
-#include "RAJA/internal/Iterators.hpp"
-
-#include "RAJA/policy/PolicyBase.hpp"
-#include "RAJA/policy/MultiPolicy.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/index/IndexSet.hpp"
 #include "RAJA/index/ListSegment.hpp"
 #include "RAJA/index/RangeSegment.hpp"
-
+#include "RAJA/internal/Iterators.hpp"
 #include "RAJA/internal/fault_tolerance.hpp"
-
-#include "RAJA/util/concepts.hpp"
-#include "RAJA/util/Span.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/policy/sequential/forall.hpp"
-
+#include "RAJA/internal/get_platform.hpp"
 #include "RAJA/pattern/detail/forall.hpp"
 #include "RAJA/pattern/detail/privatizer.hpp"
-
-#include "RAJA/internal/get_platform.hpp"
+#include "RAJA/policy/MultiPolicy.hpp"
+#include "RAJA/policy/PolicyBase.hpp"
+#include "RAJA/policy/sequential/forall.hpp"
+#include "RAJA/util/Span.hpp"
+#include "RAJA/util/concepts.hpp"
 #include "RAJA/util/plugins.hpp"
-
 #include "RAJA/util/resource.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -120,15 +111,31 @@ struct icount_adapter {
 };
 
 struct CallForall {
-  template <typename T, typename ExecPol, typename Body, typename Res, typename ForallParams>
-  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&, ExecPol, Body, Res, ForallParams) const;
+  template <typename T,
+            typename ExecPol,
+            typename Body,
+            typename Res,
+            typename ForallParams>
+  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&,
+                                                          ExecPol,
+                                                          Body,
+                                                          Res,
+                                                          ForallParams) const;
 };
 
 struct CallForallIcount {
   constexpr CallForallIcount(int s);
 
-  template <typename T, typename ExecPol, typename Body, typename Res, typename ForallParams>
-  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&, ExecPol, Body, Res, ForallParams) const;
+  template <typename T,
+            typename ExecPol,
+            typename Body,
+            typename Res,
+            typename ForallParams>
+  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&,
+                                                          ExecPol,
+                                                          Body,
+                                                          Res,
+                                                          ForallParams) const;
 
   const int start;
 };
@@ -152,12 +159,20 @@ namespace wrap
  *
  ******************************************************************************
  */
-template <typename Res, typename ExecutionPolicy, typename Container, typename LoopBody, typename ForallParams>
+template <typename Res,
+          typename ExecutionPolicy,
+          typename Container,
+          typename LoopBody,
+          typename ForallParams>
 RAJA_INLINE concepts::enable_if_t<
     RAJA::resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
     type_traits::is_range<Container>>
-forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallParams&& f_params)
+forall(Res r,
+       ExecutionPolicy&& p,
+       Container&& c,
+       LoopBody&& loop_body,
+       ForallParams&& f_params)
 {
   RAJA_FORCEINLINE_RECURSIVE
   return forall_impl(r,
@@ -167,7 +182,10 @@ forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallPa
                      std::forward<ForallParams>(f_params));
 }
 
-template <typename Res, typename ExecutionPolicy, typename Container, typename LoopBody>
+template <typename Res,
+          typename ExecutionPolicy,
+          typename Container,
+          typename LoopBody>
 RAJA_INLINE concepts::enable_if_t<
     RAJA::resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
@@ -197,11 +215,11 @@ template <typename Res,
           typename LoopBody,
           typename ForallParams>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(Res r,
-                                                      ExecutionPolicy&& p,
-                                                      Container&& c,
-                                                      IndexType&& icount,
-                                                      LoopBody&& loop_body,
-                                                      ForallParams&& f_params)
+                                                     ExecutionPolicy&& p,
+                                                     Container&& c,
+                                                     IndexType&& icount,
+                                                     LoopBody&& loop_body,
+                                                     ForallParams&& f_params)
 {
   using std::begin;
   using std::distance;
@@ -212,7 +230,11 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(Res r,
                                                                  icount);
   using policy::sequential::forall_impl;
   RAJA_FORCEINLINE_RECURSIVE
-  return forall_impl(r, std::forward<ExecutionPolicy>(p), range, adapted, std::forward<ForallParams>(f_params));
+  return forall_impl(r,
+                     std::forward<ExecutionPolicy>(p),
+                     range,
+                     adapted,
+                     std::forward<ForallParams>(f_params));
 }
 
 /*!
@@ -230,15 +252,16 @@ template <typename Res,
           typename... SegmentTypes,
           typename LoopBody,
           typename ForallParams>
-RAJA_INLINE resources::EventProxy<Res> forall_Icount(Res r,
-                                                ExecPolicy<SegmentIterPolicy,
-                                                SegmentExecPolicy>,
-                                                const TypedIndexSet<SegmentTypes...>& iset,
-                                                LoopBody loop_body,
-                                                ForallParams f_params)
+RAJA_INLINE resources::EventProxy<Res> forall_Icount(
+    Res r,
+    ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody loop_body,
+    ForallParams f_params)
 {
   // no need for icount variant here
-  auto segIterRes = resources::get_resource<SegmentIterPolicy>::type::get_default();
+  auto segIterRes =
+      resources::get_resource<SegmentIterPolicy>::type::get_default();
   wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) {
     iset.segmentCall(segID,
                      detail::CallForallIcount(iset.getStartingIcount(segID)),
@@ -256,16 +279,22 @@ template <typename Res,
           typename LoopBody,
           typename... SegmentTypes,
           typename ForallParams>
-RAJA_INLINE resources::EventProxy<Res> forall(Res r,
-                                         ExecPolicy<SegmentIterPolicy,
-                                         SegmentExecPolicy>,
-                                         const TypedIndexSet<SegmentTypes...>& iset,
-                                         LoopBody loop_body,
-                                         ForallParams f_params)
+RAJA_INLINE resources::EventProxy<Res> forall(
+    Res r,
+    ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody loop_body,
+    ForallParams f_params)
 {
-  auto segIterRes = resources::get_resource<SegmentIterPolicy>::type::get_default();
+  auto segIterRes =
+      resources::get_resource<SegmentIterPolicy>::type::get_default();
   wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) {
-    iset.segmentCall(segID, detail::CallForall{}, SegmentExecPolicy(), loop_body, r, f_params);
+    iset.segmentCall(segID,
+                     detail::CallForall{},
+                     SegmentExecPolicy(),
+                     loop_body,
+                     r,
+                     f_params);
   });
   return RAJA::resources::EventProxy<Res>(r);
 }
@@ -273,13 +302,12 @@ RAJA_INLINE resources::EventProxy<Res> forall(Res r,
 }  // end namespace wrap
 
 
-
 /*!
  ******************************************************************************
  *
- * \brief The RAJA::policy_by_value_interface forall functions provide an interface with
- *        value-based policies. It also enforces the interface and performs
- *        static checks as well as triggering plugins and loop body updates.
+ * \brief The RAJA::policy_by_value_interface forall functions provide an
+ *        interface with value-based policies. They also enforce the interface
+ *        and perform static checks, and trigger plugins and loop body updates.
  *
  ******************************************************************************
  */
@@ -294,7 +322,10 @@ inline namespace policy_by_value_interface
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy, typename Res, typename IdxSet, typename... Params>
+template <typename ExecutionPolicy,
+          typename Res,
+          typename IdxSet,
+          typename... Params>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
                                                      Res r,
                                                      IdxSet&& c,
@@ -306,9 +337,10 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
 
   auto f_params = expt::make_forall_param_pack(std::forward<Params>(params)...);
   auto&& loop_body = expt::get_lambda(std::forward<Params>(params)...);
-  //expt::check_forall_optional_args(loop_body, f_params);
+  // expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context{
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -318,18 +350,21 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
 
   util::callPreLaunchPlugins(context);
 
-  RAJA::resources::EventProxy<Res> e = wrap::forall_Icount(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<IdxSet>(c),
-      std::move(body),
-      f_params);
+  RAJA::resources::EventProxy<Res> e =
+      wrap::forall_Icount(r,
+                          std::forward<ExecutionPolicy>(p),
+                          std::forward<IdxSet>(c),
+                          std::move(body),
+                          f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
-template <typename ExecutionPolicy, typename IdxSet, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename IdxSet,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
                                                      IdxSet&& c,
                                                      LoopBody&& loop_body)
@@ -349,7 +384,10 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy, typename Res, typename IdxSet, typename... Params>
+template <typename ExecutionPolicy,
+          typename Res,
+          typename IdxSet,
+          typename... Params>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     type_traits::is_indexset_policy<ExecutionPolicy>>
@@ -363,7 +401,8 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params)
   auto&& loop_body = expt::get_lambda(std::forward<Params>(params)...);
   expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context{
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -373,18 +412,20 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params)
 
   util::callPreLaunchPlugins(context);
 
-  resources::EventProxy<Res> e = wrap::forall(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<IdxSet>(c),
-      std::move(body),
-      f_params);
+  resources::EventProxy<Res> e = wrap::forall(r,
+                                              std::forward<ExecutionPolicy>(p),
+                                              std::forward<IdxSet>(c),
+                                              std::move(body),
+                                              f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
-template <typename ExecutionPolicy, typename IdxSet, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename IdxSet,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     type_traits::is_indexset_policy<ExecutionPolicy>>
@@ -405,12 +446,14 @@ forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy, typename Container, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<Res>,
-    type_traits::is_multi_policy<ExecutionPolicy>,
-    type_traits::is_range<Container>>
+template <
+    typename ExecutionPolicy,
+    typename Container,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_multi_policy<ExecutionPolicy>,
+                                  type_traits::is_range<Container>>
 forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 {
   static_assert(type_traits::is_random_access_range<Container>::value,
@@ -420,9 +463,9 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 
   // plugins handled in multipolicy policy_invoker
   return forall_impl(r,
-              std::forward<ExecutionPolicy>(p),
-              std::forward<Container>(c),
-              std::forward<LoopBody>(loop_body));
+                     std::forward<ExecutionPolicy>(p),
+                     std::forward<Container>(c),
+                     std::forward<LoopBody>(loop_body));
 }
 
 /*!
@@ -438,10 +481,9 @@ template <typename ExecutionPolicy,
           typename IndexType,
           typename FirstParam,
           typename... Params>
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<Res>,
-    type_traits::is_range<Container>,
-    type_traits::is_integral<IndexType>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_range<Container>,
+                                  type_traits::is_integral<IndexType>>
 forall_Icount(ExecutionPolicy&& p,
               Res r,
               Container&& c,
@@ -452,11 +494,14 @@ forall_Icount(ExecutionPolicy&& p,
   static_assert(type_traits::is_random_access_range<Container>::value,
                 "Container does not model RandomAccessIterator");
 
-  auto f_params = expt::make_forall_param_pack(std::forward<FirstParam>(first), std::forward<Params>(params)...);
-  auto&& loop_body = expt::get_lambda(std::forward<FirstParam>(first), std::forward<Params>(params)...);
-  //expt::check_forall_optional_args(loop_body, f_params);
+  auto f_params = expt::make_forall_param_pack(std::forward<FirstParam>(first),
+                                               std::forward<Params>(params)...);
+  auto&& loop_body = expt::get_lambda(std::forward<FirstParam>(first),
+                                      std::forward<Params>(params)...);
+  // expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context{
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -466,22 +511,23 @@ forall_Icount(ExecutionPolicy&& p,
 
   util::callPreLaunchPlugins(context);
 
-  resources::EventProxy<Res> e = wrap::forall_Icount(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<Container>(c),
-      icount,
-      std::move(body),
-      f_params);
+  resources::EventProxy<Res> e =
+      wrap::forall_Icount(r,
+                          std::forward<ExecutionPolicy>(p),
+                          std::forward<Container>(c),
+                          icount,
+                          std::move(body),
+                          f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
-template <typename ExecutionPolicy,
-          typename Container,
-          typename IndexType,
-          typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename Container,
+    typename IndexType,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     type_traits::is_range<Container>,
@@ -509,7 +555,10 @@ forall_Icount(ExecutionPolicy&& p,
  ******************************************************************************
  */
 
-template <typename ExecutionPolicy, typename Res, typename Container, typename... Params>
+template <typename ExecutionPolicy,
+          typename Res,
+          typename Container,
+          typename... Params>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
@@ -524,7 +573,8 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params)
   auto&& loop_body = expt::get_lambda(std::forward<Params>(params)...);
   expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context{
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -534,19 +584,21 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params)
 
   util::callPreLaunchPlugins(context);
 
-  resources::EventProxy<Res> e =  wrap::forall(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<Container>(c),
-      std::move(body),
-      f_params);
+  resources::EventProxy<Res> e = wrap::forall(r,
+                                              std::forward<ExecutionPolicy>(p),
+                                              std::forward<Container>(c),
+                                              std::move(body),
+                                              f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
 
-template <typename ExecutionPolicy, typename Container, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename Container,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
@@ -562,7 +614,7 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
       std::forward<LoopBody>(loop_body));
 }
 
-}  // end inline namespace policy_by_value_interface
+}  // namespace policy_by_value_interface
 
 
 /*!
@@ -570,20 +622,25 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecutionPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename... Args,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE resources::EventProxy<Res> forall(Args&&... args)
 {
   Res r = Res::get_default();
-  return ::RAJA::policy_by_value_interface::forall(
-      ExecutionPolicy(), r, std::forward<Args>(args)...);
+  return ::RAJA::policy_by_value_interface::forall(ExecutionPolicy(),
+                                                   r,
+                                                   std::forward<Args>(args)...);
 }
 template <typename ExecutionPolicy, typename Res, typename... Args>
-RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>, type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_resource<Res>>
 forall(Res r, Args&&... args)
 {
-  return ::RAJA::policy_by_value_interface::forall(
-      ExecutionPolicy(), r, std::forward<Args>(args)...);
+  return ::RAJA::policy_by_value_interface::forall(ExecutionPolicy(),
+                                                   r,
+                                                   std::forward<Args>(args)...);
 }
 
 /*!
@@ -592,8 +649,10 @@ forall(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecutionPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename... Args,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(Args&&... args)
 {
   Res r = Res::get_default();
@@ -601,7 +660,8 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(Args&&... args)
       ExecutionPolicy(), r, std::forward<Args>(args)...);
 }
 template <typename ExecutionPolicy, typename Res, typename... Args>
-RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>, type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_resource<Res>>
 forall_Icount(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::forall_Icount(
@@ -611,12 +671,17 @@ forall_Icount(Res r, Args&&... args)
 namespace detail
 {
 
-template <typename T, typename ExecutionPolicy, typename LoopBody, typename Res, typename ForallParams>
-RAJA_INLINE camp::resources::EventProxy<Res> CallForall::operator()(T const& segment,
-                                                               ExecutionPolicy,
-                                                               LoopBody body,
-                                                               Res r,
-                                                               ForallParams f_params) const
+template <typename T,
+          typename ExecutionPolicy,
+          typename LoopBody,
+          typename Res,
+          typename ForallParams>
+RAJA_INLINE camp::resources::EventProxy<Res> CallForall::operator()(
+    T const& segment,
+    ExecutionPolicy,
+    LoopBody body,
+    Res r,
+    ForallParams f_params) const
 {
   // this is only called inside a region, use impl
   using policy::sequential::forall_impl;
@@ -626,15 +691,21 @@ RAJA_INLINE camp::resources::EventProxy<Res> CallForall::operator()(T const& seg
 
 constexpr CallForallIcount::CallForallIcount(int s) : start(s) {}
 
-template <typename T, typename ExecutionPolicy, typename LoopBody, typename Res, typename ForallParams>
-RAJA_INLINE camp::resources::EventProxy<Res> CallForallIcount::operator()(T const& segment,
-                                                                     ExecutionPolicy,
-                                                                     LoopBody body,
-                                                                     Res r,
-                                                                     ForallParams f_params) const
+template <typename T,
+          typename ExecutionPolicy,
+          typename LoopBody,
+          typename Res,
+          typename ForallParams>
+RAJA_INLINE camp::resources::EventProxy<Res> CallForallIcount::operator()(
+    T const& segment,
+    ExecutionPolicy,
+    LoopBody body,
+    Res r,
+    ForallParams f_params) const
 {
   // go through wrap to unwrap icount
-  return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body, f_params);
+  return wrap::forall_Icount(
+      r, ExecutionPolicy(), segment, start, body, f_params);
 }
 
 }  // namespace detail
@@ -647,98 +718,112 @@ RAJA_INLINE camp::resources::EventProxy<Res> CallForallIcount::operator()(T cons
 // - Returns a generic event proxy only if a resource is provided
 //   avoids overhead of constructing a typed erased resource
 //
-template<camp::idx_t IDX, typename POLICY_LIST>
-struct dynamic_helper
-{
-  template<typename SEGMENT, typename... PARAMS>
-  static void invoke_forall(const int pol, SEGMENT const &seg, PARAMS&&... params)
+template <camp::idx_t IDX, typename POLICY_LIST>
+struct dynamic_helper {
+  template <typename SEGMENT, typename... PARAMS>
+  static void invoke_forall(const int pol,
+                            SEGMENT const& seg,
+                            PARAMS&&... params)
   {
-    if(IDX==pol){
-      using t_pol = typename camp::at<POLICY_LIST,camp::num<IDX>>::type;
+    if (IDX == pol) {
+      using t_pol = typename camp::at<POLICY_LIST, camp::num<IDX>>::type;
       RAJA::forall<t_pol>(seg, params...);
       return;
     }
-    dynamic_helper<IDX-1, POLICY_LIST>::invoke_forall(pol, seg, params...);
+    dynamic_helper<IDX - 1, POLICY_LIST>::invoke_forall(pol, seg, params...);
   }
 
-  template<typename SEGMENT, typename... PARAMS>
-  static resources::EventProxy<resources::Resource>
-  invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, PARAMS&&... params)
+  template <typename SEGMENT, typename... PARAMS>
+  static resources::EventProxy<resources::Resource> invoke_forall(
+      RAJA::resources::Resource r,
+      const int pol,
+      SEGMENT const& seg,
+      PARAMS&&... params)
   {
 
-    using t_pol = typename camp::at<POLICY_LIST,camp::num<IDX>>::type;
+    using t_pol = typename camp::at<POLICY_LIST, camp::num<IDX>>::type;
     using resource_type = typename resources::get_resource<t_pol>::type;
 
-    if(IDX==pol){
+    if (IDX == pol) {
       RAJA::forall<t_pol>(r.get<resource_type>(), seg, params...);
 
-      //Return a generic event proxy from r,
-      //because forall returns a typed event proxy
+      // Return a generic event proxy from r,
+      // because forall returns a typed event proxy
       return {r};
     }
 
-    return dynamic_helper<IDX-1, POLICY_LIST>::invoke_forall(r, pol, seg, params...);
+    return dynamic_helper<IDX - 1, POLICY_LIST>::invoke_forall(r,
+                                                               pol,
+                                                               seg,
+                                                               params...);
   }
-
 };
 
-template<typename POLICY_LIST>
-struct dynamic_helper<0, POLICY_LIST>
-{
-  template<typename SEGMENT, typename... PARAMS>
-  static void
-  invoke_forall(const int pol, SEGMENT const &seg, PARAMS&&... params)
+template <typename POLICY_LIST>
+struct dynamic_helper<0, POLICY_LIST> {
+  template <typename SEGMENT, typename... PARAMS>
+  static void invoke_forall(const int pol,
+                            SEGMENT const& seg,
+                            PARAMS&&... params)
   {
-    if(0==pol){
-      using t_pol = typename camp::at<POLICY_LIST,camp::num<0>>::type;
+    if (0 == pol) {
+      using t_pol = typename camp::at<POLICY_LIST, camp::num<0>>::type;
       RAJA::forall<t_pol>(seg, params...);
       return;
     }
     RAJA_ABORT_OR_THROW("Policy enum not supported ");
   }
 
-  template<typename SEGMENT, typename... PARAMS>
-  static resources::EventProxy<resources::Resource>
-  invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, PARAMS&&... params)
+  template <typename SEGMENT, typename... PARAMS>
+  static resources::EventProxy<resources::Resource> invoke_forall(
+      RAJA::resources::Resource r,
+      const int pol,
+      SEGMENT const& seg,
+      PARAMS&&... params)
   {
-    if(pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range ");
+    if (pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range ");
 
-    using t_pol = typename camp::at<POLICY_LIST,camp::num<0>>::type;
+    using t_pol = typename camp::at<POLICY_LIST, camp::num<0>>::type;
     using resource_type = typename resources::get_resource<t_pol>::type;
 
     RAJA::forall<t_pol>(r.get<resource_type>(), seg, params...);
 
-    //Return a generic event proxy from r,
-    //because forall returns a typed event proxy
+    // Return a generic event proxy from r,
+    // because forall returns a typed event proxy
     return {r};
   }
-
 };
 
-template<typename POLICY_LIST, typename SEGMENT, typename... PARAMS>
-void dynamic_forall(const int pol, SEGMENT const &seg, PARAMS&&... params)
+template <typename POLICY_LIST, typename SEGMENT, typename... PARAMS>
+void dynamic_forall(const int pol, SEGMENT const& seg, PARAMS&&... params)
 {
   constexpr int N = camp::size<POLICY_LIST>::value;
   static_assert(N > 0, "RAJA policy list must not be empty");
 
-  if(pol > N-1)  {
+  if (pol > N - 1) {
     RAJA_ABORT_OR_THROW("Policy enum not supported");
   }
-  dynamic_helper<N-1, POLICY_LIST>::invoke_forall(pol, seg, params...);
+  dynamic_helper<N - 1, POLICY_LIST>::invoke_forall(pol, seg, params...);
 }
 
-template<typename POLICY_LIST, typename SEGMENT, typename... PARAMS>
-resources::EventProxy<resources::Resource>
-dynamic_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, PARAMS&&... params)
+template <typename POLICY_LIST, typename SEGMENT, typename... PARAMS>
+resources::EventProxy<resources::Resource> dynamic_forall(
+    RAJA::resources::Resource r,
+    const int pol,
+    SEGMENT const& seg,
+    PARAMS&&... params)
 {
   constexpr int N = camp::size<POLICY_LIST>::value;
   static_assert(N > 0, "RAJA policy list must not be empty");
 
-  if(pol > N-1)  {
+  if (pol > N - 1) {
     RAJA_ABORT_OR_THROW("Policy value out of range");
   }
 
-  return dynamic_helper<N-1, POLICY_LIST>::invoke_forall(r, pol, seg, params...);
+  return dynamic_helper<N - 1, POLICY_LIST>::invoke_forall(r,
+                                                           pol,
+                                                           seg,
+                                                           params...);
 }
 
 
diff --git a/include/RAJA/pattern/kernel.hpp b/include/RAJA/pattern/kernel.hpp
index 1875fe27d9..df045e3440 100644
--- a/include/RAJA/pattern/kernel.hpp
+++ b/include/RAJA/pattern/kernel.hpp
@@ -19,19 +19,15 @@
 #define RAJA_pattern_kernel_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/internal/get_platform.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
 #include "RAJA/util/plugins.hpp"
-
+#include "RAJA/util/types.hpp"
 #include "camp/camp.hpp"
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/pattern/kernel/internal.hpp"
-
 namespace RAJA
 {
 
@@ -57,9 +53,8 @@ struct IterableWrapperTuple;
 template <typename... Ts>
 struct IterableWrapperTuple<camp::tuple<Ts...>> {
 
-  using type =
-      camp::tuple<RAJA::Span<typename camp::decay<Ts>::iterator,
-                             typename camp::decay<Ts>::IndexType>...>;
+  using type = camp::tuple<RAJA::Span<typename camp::decay<Ts>::iterator,
+                                      typename camp::decay<Ts>::IndexType>...>;
 };
 
 
@@ -75,12 +70,12 @@ RAJA_INLINE constexpr auto make_wrapped_tuple_impl(Tuple &&t,
             camp::tuple_element_t<I, camp::decay<Tuple>>>::IndexType>...>
 {
   return camp::make_tuple(
-      RAJA::Span<
-          typename camp::decay<
-              camp::tuple_element_t<I, camp::decay<Tuple>>>::iterator,
-          typename camp::decay<camp::tuple_element_t<I, camp::decay<Tuple>>>::
-              IndexType>{camp::get<I>(std::forward<Tuple>(t)).begin(),
-                         camp::get<I>(std::forward<Tuple>(t)).end()}...);
+      RAJA::Span<typename camp::decay<
+                     camp::tuple_element_t<I, camp::decay<Tuple>>>::iterator,
+                 typename camp::decay<
+                     camp::tuple_element_t<I, camp::decay<Tuple>>>::IndexType>{
+          camp::get<I>(std::forward<Tuple>(t)).begin(),
+          camp::get<I>(std::forward<Tuple>(t)).end()}...);
 }
 }  // namespace internal
 
@@ -101,10 +96,11 @@ template <typename PolicyType,
           typename ParamTuple,
           typename Resource,
           typename... Bodies>
-RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(SegmentTuple &&segments,
-                                                                  ParamTuple &&params,
-                                                                  Resource resource,
-                                                                  Bodies &&... bodies)
+RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(
+    SegmentTuple &&segments,
+    ParamTuple &&params,
+    Resource resource,
+    Bodies &&...bodies)
 {
   util::PluginContext context{util::make_context<PolicyType>()};
 
@@ -133,9 +129,9 @@ RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(SegmentTuple &
   // and only copied to provide thread-private instances.
   loop_data_t loop_data(make_wrapped_tuple(
                             std::forward<SegmentTuple>(segments)),
-                            std::forward<ParamTuple>(params),
-                            resource,
-                            std::forward<Bodies>(bodies)...);
+                        std::forward<ParamTuple>(params),
+                        resource,
+                        std::forward<Bodies>(bodies)...);
 
   util::callPostCapturePlugins(context);
 
@@ -156,40 +152,43 @@ template <typename PolicyType,
           typename SegmentTuple,
           typename Resource,
           typename... Bodies>
-RAJA_INLINE resources::EventProxy<Resource> kernel_resource(SegmentTuple &&segments,
-                                                            Resource resource,
-                                                            Bodies &&... bodies)
+RAJA_INLINE resources::EventProxy<Resource> kernel_resource(
+    SegmentTuple &&segments,
+    Resource resource,
+    Bodies &&...bodies)
 {
-  return RAJA::kernel_param_resource<PolicyType>(std::forward<SegmentTuple>(segments),
-                                                 RAJA::make_tuple(),
-                                                 resource,
-                                                 std::forward<Bodies>(bodies)...);
+  return RAJA::kernel_param_resource<PolicyType>(
+      std::forward<SegmentTuple>(segments),
+      RAJA::make_tuple(),
+      resource,
+      std::forward<Bodies>(bodies)...);
 }
 
 template <typename PolicyType,
           typename SegmentTuple,
           typename ParamTuple,
           typename... Bodies>
-RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>> kernel_param(SegmentTuple &&segments,
-                                                                                           ParamTuple &&params,
-                                                                                           Bodies &&... bodies)
+RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>>
+kernel_param(SegmentTuple &&segments, ParamTuple &&params, Bodies &&...bodies)
 {
   auto res = resources::get_default_resource<PolicyType>();
-  return RAJA::kernel_param_resource<PolicyType>(std::forward<SegmentTuple>(segments),
-                                                 std::forward<ParamTuple>(params),
-                                                 res,
-                                                 std::forward<Bodies>(bodies)...);
+  return RAJA::kernel_param_resource<PolicyType>(
+      std::forward<SegmentTuple>(segments),
+      std::forward<ParamTuple>(params),
+      res,
+      std::forward<Bodies>(bodies)...);
 }
 
 template <typename PolicyType, typename SegmentTuple, typename... Bodies>
-RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>> kernel(SegmentTuple &&segments,
-                                                                                     Bodies &&... bodies)
+RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>>
+kernel(SegmentTuple &&segments, Bodies &&...bodies)
 {
   auto res = resources::get_default_resource<PolicyType>();
-  return RAJA::kernel_param_resource<PolicyType>(std::forward<SegmentTuple>(segments),
-                                                 RAJA::make_tuple(),
-                                                 res,
-                                                 std::forward<Bodies>(bodies)...);
+  return RAJA::kernel_param_resource<PolicyType>(
+      std::forward<SegmentTuple>(segments),
+      RAJA::make_tuple(),
+      res,
+      std::forward<Bodies>(bodies)...);
 }
 
 
diff --git a/include/RAJA/pattern/kernel/Conditional.hpp b/include/RAJA/pattern/kernel/Conditional.hpp
index 6b7875c4c2..77eaef8d80 100644
--- a/include/RAJA/pattern/kernel/Conditional.hpp
+++ b/include/RAJA/pattern/kernel/Conditional.hpp
@@ -19,13 +19,12 @@
 #define RAJA_pattern_kernel_Conditional_HPP
 
 
-#include "RAJA/config.hpp"
-
-#include "RAJA/pattern/kernel/internal.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+
 namespace RAJA
 {
 namespace statement
diff --git a/include/RAJA/pattern/kernel/For.hpp b/include/RAJA/pattern/kernel/For.hpp
index 539c451673..9ca9a80ed0 100644
--- a/include/RAJA/pattern/kernel/For.hpp
+++ b/include/RAJA/pattern/kernel/For.hpp
@@ -18,11 +18,10 @@
 #ifndef RAJA_pattern_kernel_For_HPP
 #define RAJA_pattern_kernel_For_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/internal.hpp"
 
 namespace RAJA
@@ -59,7 +58,10 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  *
  */
-template <camp::idx_t ArgumentId, typename Data, typename Types, typename... EnclosedStmts>
+template <camp::idx_t ArgumentId,
+          typename Data,
+          typename Types,
+          typename... EnclosedStmts>
 struct ForWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
@@ -85,7 +87,8 @@ template <camp::idx_t ArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::For<ArgumentId, ExecPolicy, EnclosedStmts...>, Types> {
+    statement::For<ArgumentId, ExecPolicy, EnclosedStmts...>,
+    Types> {
 
 
   template <typename Data>
@@ -103,7 +106,11 @@ struct StatementExecutor<
 
     auto r = data.res;
 
-    forall_impl(r, ExecPolicy{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r,
+                ExecPolicy{},
+                TypedRangeSegment<len_t>(0, len),
+                for_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
@@ -112,11 +119,9 @@ struct StatementExecutor<
  *
  *
  */
-template <camp::idx_t ArgumentId,
-          typename... EnclosedStmts,
-          typename Types>
-struct StatementExecutor<
-    statement::For<ArgumentId, seq_exec, EnclosedStmts...>, Types> {
+template <camp::idx_t ArgumentId, typename... EnclosedStmts, typename Types>
+struct StatementExecutor<statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
+                         Types> {
 
 
   template <typename Data>
diff --git a/include/RAJA/pattern/kernel/ForICount.hpp b/include/RAJA/pattern/kernel/ForICount.hpp
index 18515c7f59..e4b2b5b44c 100644
--- a/include/RAJA/pattern/kernel/ForICount.hpp
+++ b/include/RAJA/pattern/kernel/ForICount.hpp
@@ -18,13 +18,12 @@
 #ifndef RAJA_pattern_kernel_ForICount_HPP
 #define RAJA_pattern_kernel_ForICount_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/Param.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
 
 namespace RAJA
 {
@@ -44,8 +43,8 @@ template <camp::idx_t ArgumentId,
           typename ExecPolicy = camp::nil,
           typename... EnclosedStmts>
 struct ForICount : public internal::ForList,
-             public internal::ForTraitBase<ArgumentId, ExecPolicy>,
-             public internal::Statement<ExecPolicy, EnclosedStmts...> {
+                   public internal::ForTraitBase<ArgumentId, ExecPolicy>,
+                   public internal::Statement<ExecPolicy, EnclosedStmts...> {
 
   static_assert(std::is_base_of<internal::ParamBase, ParamId>::value,
                 "Inappropriate ParamId, ParamId must be of type "
@@ -64,7 +63,10 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <camp::idx_t ArgumentId, typename ParamId, typename Data, typename Types,
+template <camp::idx_t ArgumentId,
+          typename ParamId,
+          typename Data,
+          typename Types,
           typename... EnclosedStmts>
 struct ForICountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
 
@@ -93,7 +95,8 @@ template <camp::idx_t ArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::ForICount<ArgumentId, ParamId, ExecPolicy, EnclosedStmts...>, Types> {
+    statement::ForICount<ArgumentId, ParamId, ExecPolicy, EnclosedStmts...>,
+    Types> {
 
 
   template <typename Data>
@@ -104,15 +107,19 @@ struct StatementExecutor<
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
     // Create a wrapper, just in case forall_impl needs to thread_privatize
-    ForICountWrapper<ArgumentId, ParamId, Data, NewTypes,
-                     EnclosedStmts...> for_wrapper(data);
+    ForICountWrapper<ArgumentId, ParamId, Data, NewTypes, EnclosedStmts...>
+        for_wrapper(data);
 
     auto len = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     auto r = resources::get_resource<ExecPolicy>::type::get_default();
 
-    forall_impl(r, ExecPolicy{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r,
+                ExecPolicy{},
+                TypedRangeSegment<len_t>(0, len),
+                for_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
diff --git a/include/RAJA/pattern/kernel/Hyperplane.hpp b/include/RAJA/pattern/kernel/Hyperplane.hpp
index 955afcecc0..bc7e752a6d 100644
--- a/include/RAJA/pattern/kernel/Hyperplane.hpp
+++ b/include/RAJA/pattern/kernel/Hyperplane.hpp
@@ -18,16 +18,14 @@
 #ifndef RAJA_pattern_kernel_Hyperplane_HPP
 #define RAJA_pattern_kernel_Hyperplane_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "camp/camp.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/For.hpp"
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -81,9 +79,7 @@ template <camp::idx_t HpArgumentId,
           typename ArgList,
           typename ExecPolicy,
           typename... EnclosedStmts>
-struct Hyperplane
-    : public internal::Statement<ExecPolicy,
-                                 EnclosedStmts...> {
+struct Hyperplane : public internal::Statement<ExecPolicy, EnclosedStmts...> {
 };
 
 }  // end namespace statement
@@ -108,7 +104,8 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
                                                HpExecPolicy,
                                                ArgList<Args...>,
                                                ExecPolicy,
-                                               EnclosedStmts...>, Types> {
+                                               EnclosedStmts...>,
+                         Types> {
 
 
   template <typename Data>
@@ -135,9 +132,9 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
 
     // compute manhattan distance of iteration space to determine
     // as:  hp_len = l0 + l1 + l2 + ...
-    idx_t hp_len = segment_length<HpArgumentId>(data) +
-                   foldl(RAJA::operators::plus<idx_t>(),
-                                 segment_length<Args>(data)...);
+    idx_t hp_len =
+        segment_length<HpArgumentId>(data) +
+        foldl(RAJA::operators::plus<idx_t>(), segment_length<Args>(data)...);
 
     /* Execute the outer loop over hyperplanes
      *
@@ -146,7 +143,8 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
      * arguments actual value (and restrict to valid hyperplane indices)
      */
     auto r = resources::get_resource<HpExecPolicy>::type::get_default();
-    forall_impl(r, HpExecPolicy{},
+    forall_impl(r,
+                HpExecPolicy{},
                 TypedRangeSegment<idx_t>(0, hp_len),
                 outer_wrapper,
                 RAJA::expt::get_empty_forall_param_pack());
@@ -159,7 +157,8 @@ template <camp::idx_t HpArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    HyperplaneInner<HpArgumentId, ArgList<Args...>, EnclosedStmts...>, Types> {
+    HyperplaneInner<HpArgumentId, ArgList<Args...>, EnclosedStmts...>,
+    Types> {
 
 
   template <typename Data>
@@ -173,7 +172,7 @@ struct StatementExecutor<
     // compute actual iterate for HpArgumentId
     // as:  i0 = h - (i1 + i2 + i3 + ...)
     idx_t i = h - foldl(RAJA::operators::plus<idx_t>(),
-                                camp::get<Args>(data.offset_tuple)...);
+                        camp::get<Args>(data.offset_tuple)...);
 
     // get length of Hp indexed argument
     auto len = segment_length<HpArgumentId>(data);
diff --git a/include/RAJA/pattern/kernel/InitLocalMem.hpp b/include/RAJA/pattern/kernel/InitLocalMem.hpp
index 21d9e3cd2a..c13e4c6fbf 100644
--- a/include/RAJA/pattern/kernel/InitLocalMem.hpp
+++ b/include/RAJA/pattern/kernel/InitLocalMem.hpp
@@ -18,15 +18,15 @@
 #ifndef RAJA_pattern_kernel_InitLocalMem_HPP
 #define RAJA_pattern_kernel_InitLocalMem_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
+
 namespace RAJA
 {
 
-//Policies for RAJA local arrays
+// Policies for RAJA local arrays
 struct cpu_tile_mem;
 
 
@@ -43,13 +43,15 @@ namespace statement
  * IntiLocalMem<Pol, RAJA::param_idx<0>, statements...>
  * Will intialize the 0th array in the param tuple
  */
-template<typename Pol, typename Indices, typename... EnclosedStmts>
+template <typename Pol, typename Indices, typename... EnclosedStmts>
 struct InitLocalMem : public internal::Statement<camp::nil> {
 };
 
-//Policy Specialization
-template<camp::idx_t... Indices, typename... EnclosedStmts>
-struct InitLocalMem<RAJA::cpu_tile_mem, camp::idx_seq<Indices...>, EnclosedStmts...> : public internal::Statement<camp::nil> {
+// Policy Specialization
+template <camp::idx_t... Indices, typename... EnclosedStmts>
+struct InitLocalMem<RAJA::cpu_tile_mem,
+                    camp::idx_seq<Indices...>,
+                    EnclosedStmts...> : public internal::Statement<camp::nil> {
 };
 
 
@@ -58,23 +60,28 @@ struct InitLocalMem<RAJA::cpu_tile_mem, camp::idx_seq<Indices...>, EnclosedStmts
 namespace internal
 {
 
-//Statement executor to initalize RAJA local array
-template<camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::InitLocalMem<RAJA::cpu_tile_mem,camp::idx_seq<Indices...>, EnclosedStmts...>, Types>{
-  
-  //Execute statement list
-  template<class Data>
-  static void RAJA_INLINE exec_expanded(Data && data)
+// Statement executor to initalize RAJA local array
+template <camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
+struct StatementExecutor<statement::InitLocalMem<RAJA::cpu_tile_mem,
+                                                 camp::idx_seq<Indices...>,
+                                                 EnclosedStmts...>,
+                         Types> {
+
+  // Execute statement list
+  template <class Data>
+  static void RAJA_INLINE exec_expanded(Data &&data)
   {
     execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
   }
-  
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t... others, class Data>
-  static void RAJA_INLINE exec_expanded(Data && data)
+
+  // Intialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t... others, class Data>
+  static void RAJA_INLINE exec_expanded(Data &&data)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    using varType = typename camp::tuple_element_t<
+        Pos,
+        typename camp::decay<Data>::param_tuple_t>::value_type;
 
     // Initialize memory
 #ifdef RAJA_COMPILER_MSVC
@@ -95,16 +102,14 @@ struct StatementExecutor<statement::InitLocalMem<RAJA::cpu_tile_mem,camp::idx_se
     delete[] ptr;
 #endif
   }
-  
 
-  
-  template<typename Data>
+
+  template <typename Data>
   static RAJA_INLINE void exec(Data &&data)
   {
-    //Initalize local arrays + execute statements + cleanup
+    // Initalize local arrays + execute statements + cleanup
     exec_expanded<Indices...>(data);
   }
-  
 };
 
 
diff --git a/include/RAJA/pattern/kernel/Lambda.hpp b/include/RAJA/pattern/kernel/Lambda.hpp
index 29d41b431e..affc4faaf8 100644
--- a/include/RAJA/pattern/kernel/Lambda.hpp
+++ b/include/RAJA/pattern/kernel/Lambda.hpp
@@ -18,56 +18,51 @@
 #ifndef RAJA_pattern_kernel_Lambda_HPP
 #define RAJA_pattern_kernel_Lambda_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 #include "camp/camp.hpp"
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/pattern/kernel/internal.hpp"
-
 namespace RAJA
 {
 
 namespace internal
 {
-struct lambda_arg_seg_t
-{};
-
-struct lambda_arg_param_t
-{};
+struct lambda_arg_seg_t {
+};
 
-struct lambda_arg_offset_t
-{};
+struct lambda_arg_param_t {
+};
 
-template<typename T>
-struct lambda_arg_value_t
-{
-    using type = T;
+struct lambda_arg_offset_t {
 };
 
-template<typename T, camp::idx_t V>
-struct LambdaArg
-{
-    static constexpr camp::idx_t value = V;
+template <typename T>
+struct lambda_arg_value_t {
+  using type = T;
 };
 
-}
+template <typename T, camp::idx_t V>
+struct LambdaArg {
+  static constexpr camp::idx_t value = V;
+};
 
+}  // namespace internal
 
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more segment values
  * should be passed into the lambda as an argument
  */
-template<camp::idx_t ... args>
-using Segs = camp::list<internal::LambdaArg<internal::lambda_arg_seg_t, args>...>;
+template <camp::idx_t... args>
+using Segs =
+    camp::list<internal::LambdaArg<internal::lambda_arg_seg_t, args>...>;
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more segment offsets
@@ -79,16 +74,18 @@ using Segs = camp::list<internal::LambdaArg<internal::lambda_arg_seg_t, args>...
  * In the case of tiling (with Tile) the offset is w.r.t. the beginning of the
  * current tile.
  */
-template<camp::idx_t ... args>
-using Offsets = camp::list<internal::LambdaArg<internal::lambda_arg_offset_t, args>...>;
+template <camp::idx_t... args>
+using Offsets =
+    camp::list<internal::LambdaArg<internal::lambda_arg_offset_t, args>...>;
 
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more parameters that
  * should be passed into the lambda as an argument.
  */
-template<camp::idx_t ... args>
-using Params = camp::list<internal::LambdaArg<internal::lambda_arg_param_t, args>...>;
+template <camp::idx_t... args>
+using Params =
+    camp::list<internal::LambdaArg<internal::lambda_arg_param_t, args>...>;
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more constant values
@@ -103,8 +100,9 @@ using Params = camp::list<internal::LambdaArg<internal::lambda_arg_param_t, args
  * writing:   Lambda<0, ValuesT<double, 3, 4>>
  * invokes:   lambda0( (double)3, (double) 4 )
  */
-template<typename T, camp::idx_t ... values>
-using ValuesT = camp::list<internal::LambdaArg<internal::lambda_arg_value_t<T>, values>...>;
+template <typename T, camp::idx_t... values>
+using ValuesT =
+    camp::list<internal::LambdaArg<internal::lambda_arg_value_t<T>, values>...>;
 
 
 namespace statement
@@ -119,7 +117,7 @@ namespace statement
  * RAJA::kernel<exec_pol>(make_tuple{s0, s1, s2}, lambda0, lambda1);
  *
  */
-template <camp::idx_t BodyIdx, typename... Args >
+template <camp::idx_t BodyIdx, typename... Args>
 struct Lambda : internal::Statement<camp::nil> {
   static const camp::idx_t loop_body_index = BodyIdx;
 };
@@ -130,13 +128,6 @@ namespace internal
 {
 
 
-
-
-
-
-
-
-
 /*
  * Helper that extracts a segment value for a lambda argument
  *
@@ -146,26 +137,22 @@ namespace internal
  * This class allows specialization on the segment type in LoopTypes so that
  * fancier constructions can happen (ie vector_exec, etc.)
  */
-template<typename SegmentType, camp::idx_t id>
-struct LambdaSegExtractor
-{
+template <typename SegmentType, camp::idx_t id>
+struct LambdaSegExtractor {
 
   static_assert(!std::is_same<SegmentType, void>::value,
-      "Segment not assigned, but used in Lambda with Segs<> argument");
+                "Segment not assigned, but used in Lambda with Segs<> "
+                "argument");
 
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static SegmentType extract(Data &&data)
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data &&data)
   {
-    return SegmentType(camp::get<id>(data.segment_tuple).begin()[camp::get<id>(data.offset_tuple)]);
+    return SegmentType(camp::get<id>(data.segment_tuple)
+                           .begin()[camp::get<id>(data.offset_tuple)]);
   }
-
 };
 
 
-
 /*
  * Helper that extracts a segment value for a lambda argument
  *
@@ -175,26 +162,21 @@ struct LambdaSegExtractor
  * This class allows specialization on the segment type in LoopTypes so that
  * fancier constructions can happen (ie vector_exec, etc.)
  */
-template<typename OffsetType, camp::idx_t id>
-struct LambdaOffsetExtractor
-{
+template <typename OffsetType, camp::idx_t id>
+struct LambdaOffsetExtractor {
 
   static_assert(!std::is_same<OffsetType, void>::value,
-      "Segment not assigned, but used in Lambda with Offsets<> argument");
+                "Segment not assigned, but used in Lambda with Offsets<> "
+                "argument");
 
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static OffsetType extract(Data &&data)
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data &&data)
   {
     return OffsetType(camp::get<id>(data.offset_tuple));
   }
-
 };
 
 
-
 /*
  * Helper that provides first level of argument extraction
  * This acts as a switchboard between Segs, Offsets, and Params
@@ -202,121 +184,118 @@ struct LambdaOffsetExtractor
  * It calls LambdaArgExtractor to perform the actual argument extraction.
  * This allows LambdaArgExtractor to be specialized
  */
-template<typename Types, typename T>
+template <typename Types, typename T>
 struct LambdaArgSwitchboard;
 
 
-template<typename Types, camp::idx_t id>
-struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_offset_t, id>>
-{
+template <typename Types, camp::idx_t id>
+struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_offset_t, id>> {
 
   using OffsetType = camp::at_v<typename Types::offset_types_t, id>;
 
   static_assert(!std::is_same<OffsetType, void>::value,
-      "Offset not assigned, but used in Lambda with Offsets<> argument");
+                "Offset not assigned, but used in Lambda with Offsets<> "
+                "argument");
 
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static OffsetType extract(Data &&data)
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data &&data)
   {
-    return LambdaOffsetExtractor<OffsetType, id>::extract(std::forward<Data>(data));
+    return LambdaOffsetExtractor<OffsetType, id>::extract(
+        std::forward<Data>(data));
   }
-
 };
 
-template<typename Types, camp::idx_t id>
-struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_seg_t, id>>
-{
+template <typename Types, camp::idx_t id>
+struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_seg_t, id>> {
 
   using SegmentType = camp::at_v<typename Types::segment_types_t, id>;
 
   static_assert(!std::is_same<SegmentType, void>::value,
-      "Segment not assigned, but used in Lambda with Segs<> argument");
+                "Segment not assigned, but used in Lambda with Segs<> "
+                "argument");
 
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static SegmentType extract(Data &&data)
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data &&data)
   {
-    return LambdaSegExtractor<SegmentType, id>::extract(std::forward<Data>(data));
+    return LambdaSegExtractor<SegmentType, id>::extract(
+        std::forward<Data>(data));
   }
-
 };
 
-template<typename Types, camp::idx_t id>
-struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_param_t, id>>
-{
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static auto extract(Data &&data)->
-    typename std::add_lvalue_reference<camp::tuple_element_t<id,typename camp::decay<Data>::param_tuple_t>>::type
+template <typename Types, camp::idx_t id>
+struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_param_t, id>> {
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static auto extract(Data &&data) ->
+      typename std::add_lvalue_reference<camp::tuple_element_t<
+          id,
+          typename camp::decay<Data>::param_tuple_t>>::type
   {
     return camp::get<id>(data.param_tuple);
   }
 };
 
 
-template<typename Types, typename T, camp::idx_t value>
-struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_value_t<T>, value>>
-{
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static T extract(Data &&)
+template <typename Types, typename T, camp::idx_t value>
+struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_value_t<T>, value>> {
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static T extract(Data &&)
   {
     return T(value);
   }
 };
 
 
-
 RAJA_SUPPRESS_HD_WARN
-template<camp::idx_t LoopIndex, typename Types, typename Data, typename... targLists>
-RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda_with_args(Data &&data,
-                                                       camp::list<targLists...> const &)
+template <camp::idx_t LoopIndex,
+          typename Types,
+          typename Data,
+          typename... targLists>
+RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda_with_args(
+    Data &&data,
+    camp::list<targLists...> const &)
 {
   camp::get<LoopIndex>(data.bodies)(
       LambdaArgSwitchboard<Types, targLists>::extract(data)...);
 }
 
 
-
-
 /*!
  * A RAJA::kernel statement that invokes a lambda function
  * with user specified arguments.
  */
-template <camp::idx_t LambdaIndex,typename... Args, typename Types>
+template <camp::idx_t LambdaIndex, typename... Args, typename Types>
 struct StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types> {
 
   template <typename Data>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data &&data)
   {
 
-    //Convert SegList, ParamList into Seg, Param types, and store in a list
+    // Convert SegList, ParamList into Seg, Param types, and store in a list
     using targList = typename camp::flatten<camp::list<Args...>>::type;
 
-    invoke_lambda_with_args<LambdaIndex, Types>(std::forward<Data>(data), targList{});
+    invoke_lambda_with_args<LambdaIndex, Types>(std::forward<Data>(data),
+                                                targList{});
   }
 };
 
 
-
-template <camp::idx_t LambdaIndex, typename Types, typename Data, camp::idx_t ... SEGS, camp::idx_t ... PARAMS>
-RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(Data &&data, camp::idx_seq<SEGS...> const &, camp::idx_seq<PARAMS...> const &)
+template <camp::idx_t LambdaIndex,
+          typename Types,
+          typename Data,
+          camp::idx_t... SEGS,
+          camp::idx_t... PARAMS>
+RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(
+    Data &&data,
+    camp::idx_seq<SEGS...> const &,
+    camp::idx_seq<PARAMS...> const &)
 {
 
   using AllSegs = Segs<SEGS...>;
   using AllParams = Params<PARAMS...>;
 
   // invoke the expanded Lambda executor, passing in all segments and params
-  StatementExecutor<statement::Lambda<LambdaIndex, AllSegs, AllParams>, Types>::exec(std::forward<Data>(data));
+  StatementExecutor<statement::Lambda<LambdaIndex, AllSegs, AllParams>,
+                    Types>::exec(std::forward<Data>(data));
 }
 
 
@@ -335,7 +314,6 @@ struct StatementExecutor<statement::Lambda<LambdaIndex>, Types> {
         std::forward<Data>(data),
         camp::make_idx_seq_t<camp::tuple_size<offset_tuple_t>::value>{},
         camp::make_idx_seq_t<camp::tuple_size<param_tuple_t>::value>{});
-
   }
 };
 
diff --git a/include/RAJA/pattern/kernel/Param.hpp b/include/RAJA/pattern/kernel/Param.hpp
index 8e870ebe15..6e41382f5b 100644
--- a/include/RAJA/pattern/kernel/Param.hpp
+++ b/include/RAJA/pattern/kernel/Param.hpp
@@ -19,13 +19,12 @@
 #define RAJA_pattern_kernel_Param_HPP
 
 
-#include "RAJA/config.hpp"
-
-#include "RAJA/pattern/kernel/internal.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+
 namespace RAJA
 {
 namespace internal
@@ -34,7 +33,7 @@ namespace internal
 struct ParamBase {
 };
 
-}// end namespace internal
+}  // end namespace internal
 
 namespace statement
 {
diff --git a/include/RAJA/pattern/kernel/Reduce.hpp b/include/RAJA/pattern/kernel/Reduce.hpp
index 4de4922ea3..8c27efba9c 100644
--- a/include/RAJA/pattern/kernel/Reduce.hpp
+++ b/include/RAJA/pattern/kernel/Reduce.hpp
@@ -18,11 +18,10 @@
 #ifndef RAJA_pattern_kernel_Reduce_HPP
 #define RAJA_pattern_kernel_Reduce_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/internal.hpp"
 
 namespace RAJA
@@ -39,7 +38,8 @@ namespace statement
  *
  */
 template <typename ReducePolicy,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts>
 struct Reduce : public internal::Statement<camp::nil, EnclosedStmts...> {
diff --git a/include/RAJA/pattern/kernel/Region.hpp b/include/RAJA/pattern/kernel/Region.hpp
index 82b79ae775..33e56dde72 100644
--- a/include/RAJA/pattern/kernel/Region.hpp
+++ b/include/RAJA/pattern/kernel/Region.hpp
@@ -18,19 +18,19 @@
 #ifndef RAJA_pattern_kernel_region_HPP
 #define RAJA_pattern_kernel_region_HPP
 
-#include "RAJA/config.hpp"
-#include "RAJA/pattern/region.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
+#include "RAJA/pattern/region.hpp"
+
 namespace RAJA
 {
 
 namespace statement
 {
 
-template<typename RegionPolicy, typename... EnclosedStmts>
+template <typename RegionPolicy, typename... EnclosedStmts>
 struct Region : public internal::Statement<camp::nil> {
 };
 
@@ -40,23 +40,23 @@ struct Region : public internal::Statement<camp::nil> {
 namespace internal
 {
 
-//Statement executor to create a region within kernel
+// Statement executor to create a region within kernel
 
-//Note: RAJA region's lambda must capture by reference otherwise
-//internal function calls are undefined.
-template<typename RegionPolicy, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::Region<RegionPolicy, EnclosedStmts...>, Types> {
+// Note: RAJA region's lambda must capture by reference otherwise
+// internal function calls are undefined.
+template <typename RegionPolicy, typename... EnclosedStmts, typename Types>
+struct StatementExecutor<statement::Region<RegionPolicy, EnclosedStmts...>,
+                         Types> {
 
-template<typename Data>
-static RAJA_INLINE void exec(Data &&data)
-{
+  template <typename Data>
+  static RAJA_INLINE void exec(Data &&data)
+  {
 
-  RAJA::region<RegionPolicy>([&]() {
+    RAJA::region<RegionPolicy>([&]() {
       using data_t = camp::decay<Data>;
       execute_statement_list<camp::list<EnclosedStmts...>, Types>(data_t(data));
     });
-}
-
+  }
 };
 
 
diff --git a/include/RAJA/pattern/kernel/Tile.hpp b/include/RAJA/pattern/kernel/Tile.hpp
index 43f72e0545..54a6da0244 100644
--- a/include/RAJA/pattern/kernel/Tile.hpp
+++ b/include/RAJA/pattern/kernel/Tile.hpp
@@ -18,18 +18,16 @@
 #ifndef RAJA_pattern_kernel_Tile_HPP
 #define RAJA_pattern_kernel_Tile_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "camp/camp.hpp"
-#include "camp/concepts.hpp"
-#include "camp/tuple.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/internal.hpp"
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
+#include "camp/concepts.hpp"
+#include "camp/tuple.hpp"
 
 namespace RAJA
 {
@@ -39,9 +37,7 @@ struct TileSize {
 
   RAJA_HOST_DEVICE
   RAJA_INLINE
-  constexpr TileSize(camp::idx_t size_) : size{size_}
-  {
-  }
+  constexpr TileSize(camp::idx_t size_) : size{size_} {}
 };
 
 namespace statement
@@ -75,7 +71,6 @@ struct tile_dynamic {
 };
 
 
-
 namespace internal
 {
 
@@ -84,7 +79,10 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <camp::idx_t ArgumentId, typename Data, typename Types, typename... EnclosedStmts>
+template <camp::idx_t ArgumentId,
+          typename Data,
+          typename Types,
+          typename... EnclosedStmts>
 struct TileWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
@@ -107,8 +105,7 @@ template <typename Iterable>
 struct IterableTiler {
   using value_type = camp::decay<Iterable>;
 
-  struct iterate
-  {
+  struct iterate {
     value_type s;
     Index_type i;
   };
@@ -222,7 +219,8 @@ template <camp::idx_t ArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::Tile<ArgumentId, tile_fixed<ChunkSize>, EPol, EnclosedStmts...>, Types> {
+    statement::Tile<ArgumentId, tile_fixed<ChunkSize>, EPol, EnclosedStmts...>,
+    Types> {
 
   template <typename Data>
   static RAJA_INLINE void exec(Data &data)
@@ -238,24 +236,29 @@ struct StatementExecutor<
     IterableTiler<decltype(segment)> tiled_iterable(segment, chunk_size);
 
     // Wrap in case forall_impl needs to thread_privatize
-    TileWrapper<ArgumentId, Data, Types,
-                EnclosedStmts...> tile_wrapper(data);
+    TileWrapper<ArgumentId, Data, Types, EnclosedStmts...> tile_wrapper(data);
 
     // Loop over tiles, executing enclosed statement list
     auto r = resources::get_resource<EPol>::type::get_default();
-    forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r,
+                EPol{},
+                tiled_iterable,
+                tile_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
 
     // Set range back to original values
     camp::get<ArgumentId>(data.segment_tuple) = tiled_iterable.it;
   }
 };
 
-template<camp::idx_t ArgumentId,
-  typename EPol,
-  typename... EnclosedStmts,
-  typename Types>
+template <camp::idx_t ArgumentId,
+          typename EPol,
+          typename... EnclosedStmts,
+          typename Types>
 struct StatementExecutor<
-    statement::Tile<ArgumentId, tile_dynamic<ArgumentId>, EPol, EnclosedStmts...>, Types> {
+    statement::
+        Tile<ArgumentId, tile_dynamic<ArgumentId>, EPol, EnclosedStmts...>,
+    Types> {
 
   template <typename Data>
   static RAJA_INLINE void exec(Data &data)
@@ -265,20 +268,24 @@ struct StatementExecutor<
 
     // Get the tiling policies chunk size
     auto chunk_size = camp::get<ArgumentId>(data.param_tuple);
-    static_assert(camp::concepts::metalib::is_same<TileSize, decltype(chunk_size)>::value,
-                  "Extracted parameter must be of type TileSize.");
+    static_assert(
+        camp::concepts::metalib::is_same<TileSize, decltype(chunk_size)>::value,
+        "Extracted parameter must be of type TileSize.");
 
     // Create a tile iterator
     IterableTiler<decltype(segment)> tiled_iterable(segment, chunk_size.size);
 
     // Wrap in case forall_impl needs to thread_privatize
-    TileWrapper<ArgumentId, Data, Types,
-                EnclosedStmts...> tile_wrapper(data);
+    TileWrapper<ArgumentId, Data, Types, EnclosedStmts...> tile_wrapper(data);
 
     // Loop over tiles, executing enclosed statement list
     auto r = resources::get_resource<EPol>::type::get_default();
-    forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack());
-    
+    forall_impl(r,
+                EPol{},
+                tiled_iterable,
+                tile_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
+
     // Set range back to original values
     camp::get<ArgumentId>(data.segment_tuple) = tiled_iterable.it;
   }
diff --git a/include/RAJA/pattern/kernel/TileTCount.hpp b/include/RAJA/pattern/kernel/TileTCount.hpp
index 2653e992c7..2b91186da6 100644
--- a/include/RAJA/pattern/kernel/TileTCount.hpp
+++ b/include/RAJA/pattern/kernel/TileTCount.hpp
@@ -18,18 +18,16 @@
 #ifndef RAJA_pattern_kernel_TileTCount_HPP
 #define RAJA_pattern_kernel_TileTCount_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "camp/camp.hpp"
-#include "camp/concepts.hpp"
-#include "camp/tuple.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/internal.hpp"
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
+#include "camp/concepts.hpp"
+#include "camp/tuple.hpp"
 
 namespace RAJA
 {
@@ -66,9 +64,13 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <camp::idx_t ArgumentId, typename ParamId, typename Data, typename Types,
+template <camp::idx_t ArgumentId,
+          typename ParamId,
+          typename Data,
+          typename Types,
           typename... EnclosedStmts>
-struct TileTCountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
+struct TileTCountWrapper
+    : public GenericWrapper<Data, Types, EnclosedStmts...> {
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
   using Base::Base;
@@ -79,17 +81,16 @@ struct TileTCountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
   {
     // Assign the tile's segment to the tuple
     camp::get<ArgumentId>(Base::data.segment_tuple) = si.s;
-    
+
     // Assign the tile's index
     Base::data.template assign_param<ParamId>(si.i);
-    
+
     // Execute enclosed statements
     Base::exec();
   }
 };
 
 
-
 /*!
  * A generic RAJA::kernel forall_impl executor for statement::TileTCount
  *
@@ -102,7 +103,8 @@ template <camp::idx_t ArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::TileTCount<ArgumentId, ParamId, TPol, EPol, EnclosedStmts...>, Types> {
+    statement::TileTCount<ArgumentId, ParamId, TPol, EPol, EnclosedStmts...>,
+    Types> {
 
 
   template <typename Data>
@@ -119,12 +121,16 @@ struct StatementExecutor<
     IterableTiler<decltype(segment)> tiled_iterable(segment, chunk_size);
 
     // Wrap in case forall_impl needs to thread_privatize
-    TileTCountWrapper<ArgumentId, ParamId, Data, Types,
-                      EnclosedStmts...> tile_wrapper(data);
+    TileTCountWrapper<ArgumentId, ParamId, Data, Types, EnclosedStmts...>
+        tile_wrapper(data);
 
     // Loop over tiles, executing enclosed statement list
     auto r = resources::get_resource<EPol>::type::get_default();
-    forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r,
+                EPol{},
+                tiled_iterable,
+                tile_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
 
     // Set range back to original values
     camp::get<ArgumentId>(data.segment_tuple) = tiled_iterable.it;
diff --git a/include/RAJA/pattern/kernel/internal/LoopData.hpp b/include/RAJA/pattern/kernel/internal/LoopData.hpp
index 9667a55538..86c2b67e96 100644
--- a/include/RAJA/pattern/kernel/internal/LoopData.hpp
+++ b/include/RAJA/pattern/kernel/internal/LoopData.hpp
@@ -19,20 +19,17 @@
 #ifndef RAJA_pattern_kernel_internal_LoopData_HPP
 #define RAJA_pattern_kernel_internal_LoopData_HPP
 
-#include "RAJA/config.hpp"
+#include <iterator>
+#include <type_traits>
 
+#include "RAJA/config.hpp"
 #include "RAJA/index/IndexSet.hpp"
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "camp/camp.hpp"
-
 #include "RAJA/pattern/detail/privatizer.hpp"
 #include "RAJA/pattern/kernel/internal/StatementList.hpp"
 #include "RAJA/pattern/kernel/internal/Template.hpp"
-
-#include <iterator>
-#include <type_traits>
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -40,25 +37,21 @@ namespace internal
 {
 
 
-
-
-  // Universal base of all For wrappers for type traits
-  struct ForList {
-  };
-  struct ForBase {
-  };
-  struct CollapseBase {
-  };
-  template <camp::idx_t ArgumentId, typename Policy>
-  struct ForTraitBase : public ForBase {
-    constexpr static camp::idx_t index_val = ArgumentId;
-    using index = camp::num<ArgumentId>;
-    using index_type = camp::nil;  // default to invalid type
-    using policy_type = Policy;
-    using type = ForTraitBase;  // make camp::value compatible
-  };
-
-
+// Universal base of all For wrappers for type traits
+struct ForList {
+};
+struct ForBase {
+};
+struct CollapseBase {
+};
+template <camp::idx_t ArgumentId, typename Policy>
+struct ForTraitBase : public ForBase {
+  constexpr static camp::idx_t index_val = ArgumentId;
+  using index = camp::num<ArgumentId>;
+  using index_type = camp::nil;  // default to invalid type
+  using policy_type = Policy;
+  using type = ForTraitBase;  // make camp::value compatible
+};
 
 
 template <typename Iterator>
@@ -100,8 +93,6 @@ using index_types_from_segments =
                            value_type_list_from_segments<Segments>>::type;
 
 
-
-
 template <typename SegmentTuple,
           typename ParamTuple,
           typename Resource,
@@ -138,8 +129,10 @@ struct LoopData {
   using vector_sizes_t = tuple_of_n<int, camp::tuple_size<SegmentTuple>::value>;
   vector_sizes_t vector_sizes;
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  LoopData(SegmentTuple const &s, ParamTuple const &p, Resource r, Bodies const &... b)
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr LoopData(SegmentTuple const &s,
+                                                  ParamTuple const &p,
+                                                  Resource r,
+                                                  Bodies const &...b)
       : segment_tuple(s), param_tuple(p), res(r), bodies(b...)
   {
   }
@@ -155,50 +148,37 @@ struct LoopData {
   template <typename ParamId, typename IndexT>
   RAJA_HOST_DEVICE RAJA_INLINE void assign_param(IndexT const &i)
   {
-    using param_t = camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>;
+    using param_t =
+        camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>;
     camp::get<ParamId::param_idx>(param_tuple) = param_t(i);
   }
 
   template <typename ParamId>
-  RAJA_HOST_DEVICE RAJA_INLINE
-  auto get_param() ->
-    camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>
+  RAJA_HOST_DEVICE RAJA_INLINE auto get_param()
+      -> camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>
   {
     return camp::get<ParamId::param_idx>(param_tuple);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  Resource get_resource()
-  {
-    return res;
-  }
-
-
+  RAJA_HOST_DEVICE RAJA_INLINE Resource get_resource() { return res; }
 };
 
 
-
-
 template <camp::idx_t ArgumentId, typename Data>
-using segment_diff_type =
-    typename std::iterator_traits<
-        typename camp::at_v<typename Data::segment_tuple_t::TList,
-                            ArgumentId>::iterator>::difference_type;
-
-
+using segment_diff_type = typename std::iterator_traits<
+    typename camp::at_v<typename Data::segment_tuple_t::TList,
+                        ArgumentId>::iterator>::difference_type;
 
 
 template <camp::idx_t ArgumentId, typename Data>
-RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const &data) ->
-  segment_diff_type<ArgumentId, Data>
+RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const &data)
+    -> segment_diff_type<ArgumentId, Data>
 {
   return camp::get<ArgumentId>(data.segment_tuple).end() -
          camp::get<ArgumentId>(data.segment_tuple).begin();
 }
 
 
-
-
 template <typename Data, typename Types, typename... EnclosedStmts>
 struct GenericWrapper : GenericWrapperBase {
   using data_t = camp::decay<Data>;
@@ -209,7 +189,10 @@ struct GenericWrapper : GenericWrapperBase {
   constexpr explicit GenericWrapper(data_t &d) : data{d} {}
 
   RAJA_INLINE
-  void exec() { execute_statement_list<camp::list<EnclosedStmts...>, Types>(data); }
+  void exec()
+  {
+    execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
+  }
 };
 
 
@@ -236,7 +219,6 @@ struct NestedPrivatizer {
 };
 
 
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp
index 7f77df4214..271ffb7214 100644
--- a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp
+++ b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp
@@ -29,63 +29,69 @@ namespace internal
 {
 
 
-template <typename SegmentTypes,
-          typename OffsetTypes>
+template <typename SegmentTypes, typename OffsetTypes>
 struct LoopTypes;
 
-template <typename ... SegmentTypes,
-          typename ... OffsetTypes>
+template <typename... SegmentTypes, typename... OffsetTypes>
 struct LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>> {
 
-  using Self = LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>>;
+  using Self =
+      LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>>;
 
   static constexpr size_t s_num_segments = sizeof...(SegmentTypes);
 
   // This ensures that you don't double-loop over a segment within the same
   // loop nesting
   static_assert(s_num_segments == sizeof...(OffsetTypes),
-      "Number of segments and offsets must match");
+                "Number of segments and offsets must match");
 
   using segment_types_t = camp::list<SegmentTypes...>;
   using offset_types_t = camp::list<OffsetTypes...>;
 };
 
 
-template<typename Data>
-using makeInitialLoopTypes =
-    LoopTypes<list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>,
-              list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>>;
+template <typename Data>
+using makeInitialLoopTypes = LoopTypes<
+    list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>,
+    list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>>;
 
 
-template<typename Types, camp::idx_t Segment, typename T, typename Seq>
+template <typename Types, camp::idx_t Segment, typename T, typename Seq>
 struct SetSegmentTypeHelper;
 
-template<typename Types,
-         camp::idx_t Segment,
-         typename T,
-         camp::idx_t ... SEQ>
-struct SetSegmentTypeHelper<Types, Segment, T, camp::idx_seq<SEQ...>>
-{
-    using segment_list = typename Types::segment_types_t;
-    using offset_list = typename Types::offset_types_t;
-
-    static_assert(std::is_same<camp::at_v<segment_list, Segment>, void>::value,
-        "Segment was already assigned: Probably looping over same segment in loop nest");
-
-    using type = LoopTypes<
-        camp::list<typename std::conditional<SEQ == Segment, T, camp::at_v<segment_list, SEQ>>::type...>,
-        camp::list<typename std::conditional<SEQ == Segment, T, camp::at_v<segment_list, SEQ>>::type...>>;
-
+template <typename Types, camp::idx_t Segment, typename T, camp::idx_t... SEQ>
+struct SetSegmentTypeHelper<Types, Segment, T, camp::idx_seq<SEQ...>> {
+  using segment_list = typename Types::segment_types_t;
+  using offset_list = typename Types::offset_types_t;
+
+  static_assert(std::is_same<camp::at_v<segment_list, Segment>, void>::value,
+                "Segment was already assigned: Probably looping over same "
+                "segment in loop nest");
+
+  using type = LoopTypes<
+      camp::list<
+          typename std::conditional<SEQ == Segment,
+                                    T,
+                                    camp::at_v<segment_list, SEQ>>::type...>,
+      camp::list<
+          typename std::conditional<SEQ == Segment,
+                                    T,
+                                    camp::at_v<segment_list, SEQ>>::type...>>;
 };
 
 
-template<typename Types, camp::idx_t Segment, typename T>
-using setSegmentType =
-    typename SetSegmentTypeHelper<Types, Segment, T, camp::make_idx_seq_t<Types::s_num_segments>>::type;
+template <typename Types, camp::idx_t Segment, typename T>
+using setSegmentType = typename SetSegmentTypeHelper<
+    Types,
+    Segment,
+    T,
+    camp::make_idx_seq_t<Types::s_num_segments>>::type;
 
-template<typename Types, camp::idx_t Segment, typename Data>
-using setSegmentTypeFromData =
-    setSegmentType<Types, Segment, camp::at_v<typename camp::decay<Data>::index_types_t, Segment>>;
+template <typename Types, camp::idx_t Segment, typename Data>
+using setSegmentTypeFromData = setSegmentType<
+    Types,
+    Segment,
+    camp::at_v<typename camp::decay<Data>::index_types_t, Segment>>;
 
 
 }  // end namespace internal
diff --git a/include/RAJA/pattern/kernel/internal/Statement.hpp b/include/RAJA/pattern/kernel/internal/Statement.hpp
index 48ca828a68..e9dc91437f 100644
--- a/include/RAJA/pattern/kernel/internal/Statement.hpp
+++ b/include/RAJA/pattern/kernel/internal/Statement.hpp
@@ -18,9 +18,10 @@
 #ifndef RAJA_pattern_kernel_internal_Statement_HPP
 #define RAJA_pattern_kernel_internal_Statement_HPP
 
-#include "RAJA/pattern/kernel/internal/StatementList.hpp"
-#include <type_traits>
 #include <camp/camp.hpp>
+#include <type_traits>
+
+#include "RAJA/pattern/kernel/internal/StatementList.hpp"
 
 namespace RAJA
 {
@@ -28,11 +29,12 @@ namespace internal
 {
 
 
-
 template <typename ExecPolicy, typename... EnclosedStmts>
 struct Statement {
-  static_assert(std::is_same<ExecPolicy, camp::nil>::value || sizeof...(EnclosedStmts) > 0,
-      "Executable statement with no enclosed statements, this is almost certainly a bug");
+  static_assert(std::is_same<ExecPolicy, camp::nil>::value ||
+                    sizeof...(EnclosedStmts) > 0,
+                "Executable statement with no enclosed statements, this is "
+                "almost certainly a bug");
   Statement() = delete;
 
   using enclosed_statements_t = StatementList<EnclosedStmts...>;
@@ -40,13 +42,10 @@ struct Statement {
 };
 
 
-
-
 template <typename Policy, typename Types>
 struct StatementExecutor;
 
 
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/pattern/kernel/internal/StatementList.hpp b/include/RAJA/pattern/kernel/internal/StatementList.hpp
index 5c0d71afb4..2055a94674 100644
--- a/include/RAJA/pattern/kernel/internal/StatementList.hpp
+++ b/include/RAJA/pattern/kernel/internal/StatementList.hpp
@@ -18,12 +18,12 @@
 #ifndef RAJA_pattern_kernel_internal_StatementList_HPP
 #define RAJA_pattern_kernel_internal_StatementList_HPP
 
+#include <type_traits>
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/macros.hpp"
 #include "camp/camp.hpp"
 
-#include <type_traits>
-
 namespace RAJA
 {
 namespace internal
@@ -35,8 +35,6 @@ template <typename Policy, typename Types>
 struct StatementExecutor;
 
 
-
-
 template <typename... Stmts>
 using StatementList = camp::list<Stmts...>;
 
@@ -47,7 +45,8 @@ struct StatementListExecutor;
 
 template <camp::idx_t statement_index,
           camp::idx_t num_statements,
-          typename StmtList, typename Types>
+          typename StmtList,
+          typename Types>
 struct StatementListExecutor {
 
   template <typename Data>
@@ -61,8 +60,10 @@ struct StatementListExecutor {
     StatementExecutor<statement, Types>::exec(std::forward<Data>(data));
 
     // call our next statement
-    StatementListExecutor<statement_index + 1, num_statements, StmtList, Types>::exec(
-        std::forward<Data>(data));
+    StatementListExecutor<statement_index + 1,
+                          num_statements,
+                          StmtList,
+                          Types>::exec(std::forward<Data>(data));
   }
 };
 
@@ -89,7 +90,6 @@ RAJA_INLINE void execute_statement_list(Data &&data)
 }
 
 
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/pattern/kernel/internal/Template.hpp b/include/RAJA/pattern/kernel/internal/Template.hpp
index c750b95986..b7ac5c864f 100644
--- a/include/RAJA/pattern/kernel/internal/Template.hpp
+++ b/include/RAJA/pattern/kernel/internal/Template.hpp
@@ -31,17 +31,15 @@ namespace detail
 // Helper class to convert a camp::idx_t into some type T
 // used in template expansion in ListOfNHelper
 template <typename T, camp::idx_t>
-struct SeqToType
-{
+struct SeqToType {
   using type = T;
 };
 
 template <typename T, typename SEQ>
 struct ListOfNHelper;
 
-template <typename T, camp::idx_t ... SEQ>
-struct ListOfNHelper<T, camp::idx_seq<SEQ...> >
-{
+template <typename T, camp::idx_t... SEQ>
+struct ListOfNHelper<T, camp::idx_seq<SEQ...>> {
   using type = camp::list<typename SeqToType<T, SEQ>::type...>;
 };
 
@@ -49,13 +47,12 @@ struct ListOfNHelper<T, camp::idx_seq<SEQ...> >
 template <typename T, typename SEQ>
 struct TupleOfNHelper;
 
-template <typename T, camp::idx_t ... SEQ>
-struct TupleOfNHelper<T, camp::idx_seq<SEQ...> >
-{
+template <typename T, camp::idx_t... SEQ>
+struct TupleOfNHelper<T, camp::idx_seq<SEQ...>> {
   using type = camp::tuple<typename SeqToType<T, SEQ>::type...>;
 };
 
-} // namespace detail
+}  // namespace detail
 
 /*
  *  This creates a camp::list with N types, each one being T.
@@ -64,7 +61,8 @@ struct TupleOfNHelper<T, camp::idx_seq<SEQ...> >
  *
  */
 template <typename T, camp::idx_t N>
-using list_of_n = typename detail::ListOfNHelper<T, camp::make_idx_seq_t<N>>::type;
+using list_of_n =
+    typename detail::ListOfNHelper<T, camp::make_idx_seq_t<N>>::type;
 
 
 /*
@@ -74,8 +72,8 @@ using list_of_n = typename detail::ListOfNHelper<T, camp::make_idx_seq_t<N>>::ty
  *
  */
 template <typename T, camp::idx_t N>
-using tuple_of_n = typename detail::TupleOfNHelper<T, camp::make_idx_seq_t<N>>::type;
-
+using tuple_of_n =
+    typename detail::TupleOfNHelper<T, camp::make_idx_seq_t<N>>::type;
 
 
 }  // end namespace internal
diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index ff10f04dae..f67073c25b 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -28,7 +28,7 @@
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-//Odd dependecy with atomics is breaking CI builds
+// Odd dependency with atomics is breaking CI builds
 //#include "RAJA/util/View.hpp"
 
 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_ENABLE_SYCL)
@@ -41,7 +41,7 @@ namespace RAJA
 {
 
 // GPU or CPU threads available
-//strongly type the ExecPlace (guards agaist errors)
+// strongly type the ExecPlace (guards against errors)
 enum struct ExecPlace : int { HOST, DEVICE, NUM_PLACES };
 
 struct null_launch_t {
@@ -138,8 +138,12 @@ struct LaunchParams {
   RAJA_INLINE
   LaunchParams() = default;
 
-  LaunchParams(Teams in_teams, Threads in_threads, size_t in_shared_mem_size = 0)
-    : teams(in_teams), threads(in_threads), shared_mem_size(in_shared_mem_size) {};
+  LaunchParams(Teams in_teams,
+               Threads in_threads,
+               size_t in_shared_mem_size = 0)
+      : teams(in_teams),
+        threads(in_threads),
+        shared_mem_size(in_shared_mem_size){};
 
 private:
   RAJA_HOST_DEVICE
@@ -154,9 +158,8 @@ struct LaunchParams {
 class LaunchContext
 {
 public:
-
-  //Bump style allocator used to
-  //get memory from the pool
+  // Bump style allocator used to
+  // get memory from the pool
   size_t shared_mem_offset;
 
   void *shared_mem_ptr;
@@ -166,39 +169,41 @@ class LaunchContext
 #endif
 
   RAJA_HOST_DEVICE LaunchContext()
-    : shared_mem_offset(0), shared_mem_ptr(nullptr)
+      : shared_mem_offset(0), shared_mem_ptr(nullptr)
   {
   }
 
-  //TODO handle alignment
-  template<typename T>
-  RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes)
+  // TODO handle alignment
+  template <typename T>
+  RAJA_HOST_DEVICE T *getSharedMemory(size_t bytes)
   {
 
-    //Calculate offset in bytes with a char pointer
-    void* mem_ptr = static_cast<char *>(shared_mem_ptr) + shared_mem_offset;
+    // Calculate offset in bytes with a char pointer
+    void *mem_ptr = static_cast<char *>(shared_mem_ptr) + shared_mem_offset;
 
-    shared_mem_offset += bytes*sizeof(T);
+    shared_mem_offset += bytes * sizeof(T);
 
-    //convert to desired type
-    return static_cast<T*>(mem_ptr);
+    // convert to desired type
+    return static_cast<T *>(mem_ptr);
   }
 
   /*
   //Odd dependecy with atomics is breaking CI builds
-  template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t z_stride=DIM-1, typename arg, typename... args>
-  RAJA_HOST_DEVICE auto getSharedMemoryView(size_t bytes, arg idx, args... idxs)
+  template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t
+  z_stride=DIM-1, typename arg, typename... args> RAJA_HOST_DEVICE auto
+  getSharedMemoryView(size_t bytes, arg idx, args... idxs)
   {
     T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset];
 
     shared_mem_offset += bytes*sizeof(T);
-    return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx, idxs...);
+    return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx,
+  idxs...);
   }
   */
 
   RAJA_HOST_DEVICE void releaseSharedMemory()
   {
-    //On the cpu/gpu we want to restart the count
+    // On the cpu/gpu we want to restart the count
     shared_mem_offset = 0;
   }
 
@@ -218,19 +223,24 @@ class LaunchContext
 template <typename LAUNCH_POLICY>
 struct LaunchExecute;
 
-//Policy based launch with support to new reducers...
-template <typename LAUNCH_POLICY, typename ... ReduceParams>
-void launch(LaunchParams const &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args)
+// Policy based launch with support to new reducers...
+template <typename LAUNCH_POLICY, typename... ReduceParams>
+void launch(LaunchParams const &launch_params,
+            const char *kernel_name,
+            ReduceParams &&...rest_of_launch_args)
 {
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto &&launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  //Take the first policy as we assume the second policy is not user defined.
-  //We rely on the user to pair launch and loop policies correctly.
-  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
+  // Take the first policy as we assume the second policy is not user defined.
+  // We rely on the user to pair launch and loop policies correctly.
+  util::PluginContext context{
+      util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -242,29 +252,36 @@ void launch(LaunchParams const &launch_params, const char *kernel_name, ReducePa
 
   using launch_t = LaunchExecute<typename LAUNCH_POLICY::host_policy_t>;
 
-  using Res = typename resources::get_resource<typename LAUNCH_POLICY::host_policy_t>::type;
+  using Res = typename resources::get_resource<
+      typename LAUNCH_POLICY::host_policy_t>::type;
 
-  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers);
+  launch_t::exec(
+      Res::get_default(), launch_params, kernel_name, p_body, reducers);
 
   util::callPostLaunchPlugins(context);
 }
 
 
-//Duplicate of code above on account that we need to support the case in which a kernel_name is not given
-template <typename LAUNCH_POLICY, typename ... ReduceParams>
-void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_args)
+// Duplicate of code above on account that we need to support the case in which
+// a kernel_name is not given
+template <typename LAUNCH_POLICY, typename... ReduceParams>
+void launch(LaunchParams const &launch_params,
+            ReduceParams &&...rest_of_launch_args)
 {
 
   const char *kernel_name = nullptr;
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto &&launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  //Take the first policy as we assume the second policy is not user defined.
-  //We rely on the user to pair launch and loop policies correctly.
-  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
+  // Take the first policy as we assume the second policy is not user defined.
+  // We rely on the user to pair launch and loop policies correctly.
+  util::PluginContext context{
+      util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -276,15 +293,17 @@ void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_
 
   using launch_t = LaunchExecute<typename LAUNCH_POLICY::host_policy_t>;
 
-  using Res = typename resources::get_resource<typename LAUNCH_POLICY::host_policy_t>::type;
+  using Res = typename resources::get_resource<
+      typename LAUNCH_POLICY::host_policy_t>::type;
 
-  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers);
+  launch_t::exec(
+      Res::get_default(), launch_params, kernel_name, p_body, reducers);
 
   util::callPostLaunchPlugins(context);
 }
 
 //=================================================
-//Run time based policy launch
+// Run time based policy launch
 //=================================================
 template <typename POLICY_LIST, typename BODY>
 void launch(ExecPlace place, LaunchParams const &params, BODY const &body)
@@ -293,131 +312,174 @@ void launch(ExecPlace place, LaunchParams const &params, BODY const &body)
 }
 
 template <typename POLICY_LIST, typename BODY>
-void launch(ExecPlace place, const LaunchParams &params, const char *kernel_name, BODY const &body)
+void launch(ExecPlace place,
+            const LaunchParams &params,
+            const char *kernel_name,
+            BODY const &body)
 {
 
-  //Forward to single policy launch API - simplifies testing of plugins
+  // Forward to single policy launch API - simplifies testing of plugins
   switch (place) {
     case ExecPlace::HOST: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::host_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(Res::get_default(), params, kernel_name, body);
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::host_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(
+          Res::get_default(), params, kernel_name, body);
       break;
     }
 #if defined(RAJA_GPU_ACTIVE)
-  case ExecPlace::DEVICE: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::device_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(Res::get_default(), params, kernel_name, body);
+    case ExecPlace::DEVICE: {
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::device_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(
+          Res::get_default(), params, kernel_name, body);
       break;
     }
 #endif
     default:
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
-
 }
 
-//Run-time API for new reducer interface
+// Run-time API for new reducer interface
 template <typename POLICY_LIST, typename... ReduceParams>
-void launch(ExecPlace place, const LaunchParams &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args)
+void launch(ExecPlace place,
+            const LaunchParams &launch_params,
+            const char *kernel_name,
+            ReduceParams &&...rest_of_launch_args)
 {
 
-  //Forward to single policy launch API - simplifies testing of plugins
+  // Forward to single policy launch API - simplifies testing of plugins
   switch (place) {
     case ExecPlace::HOST: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::host_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::host_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(
+          Res::get_default(),
+          launch_params,
+          kernel_name,
+          std::forward<ReduceParams>(rest_of_launch_args)...);
       break;
     }
 #if defined(RAJA_GPU_ACTIVE)
-  case ExecPlace::DEVICE: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::device_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
+    case ExecPlace::DEVICE: {
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::device_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(
+          Res::get_default(),
+          launch_params,
+          kernel_name,
+          std::forward<ReduceParams>(rest_of_launch_args)...);
       break;
     }
 #endif
     default:
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
-
 }
 
-//Run-time API for new reducer interface with support of the case without a new kernel name
+// Run-time API for new reducer interface with support of the case without a new
+// kernel name
 template <typename POLICY_LIST, typename... ReduceParams>
-void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&... rest_of_launch_args)
-            //BODY const &body)
+void launch(ExecPlace place,
+            const LaunchParams &launch_params,
+            ReduceParams &&...rest_of_launch_args)
+// BODY const &body)
 {
 
   const char *kernel_name = nullptr;
 
-  //Forward to single policy launch API - simplifies testing of plugins
+  // Forward to single policy launch API - simplifies testing of plugins
   switch (place) {
     case ExecPlace::HOST: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::host_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::host_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(
+          Res::get_default(),
+          launch_params,
+          kernel_name,
+          std::forward<ReduceParams>(rest_of_launch_args)...);
       break;
     }
 #if defined(RAJA_GPU_ACTIVE)
-  case ExecPlace::DEVICE: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::device_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
+    case ExecPlace::DEVICE: {
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::device_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(
+          Res::get_default(),
+          launch_params,
+          kernel_name,
+          std::forward<ReduceParams>(rest_of_launch_args)...);
       break;
     }
 #endif
     default:
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
-
 }
 
 
-// Helper function to retrieve a resource based on the run-time policy - if a device is active
-#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL)
-template<typename T, typename U>
-RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){
-  if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);}
-  else { return RAJA::resources::Resource(host_res); }
+// Helper function to retrieve a resource based on the run-time policy - if a
+// device is active
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \
+    defined(RAJA_ENABLE_SYCL)
+template <typename T, typename U>
+RAJA::resources::Resource Get_Runtime_Resource(T host_res,
+                                               U device_res,
+                                               RAJA::ExecPlace device)
+{
+  if (device == RAJA::ExecPlace::DEVICE) {
+    return RAJA::resources::Resource(device_res);
+  } else {
+    return RAJA::resources::Resource(host_res);
+  }
 }
 #endif
 
-template<typename T>
-RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){
-  if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");}
+template <typename T>
+RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device)
+{
+  if (device == RAJA::ExecPlace::DEVICE) {
+    RAJA_ABORT_OR_THROW("Device is not enabled");
+  }
 
   return RAJA::resources::Resource(host_res);
 }
 
-//Launch API which takes team resource struct and supports new reducers
-template <typename POLICY_LIST, typename ... ReduceParams>
-resources::EventProxy<resources::Resource>
-launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       const char *kernel_name, ReduceParams&&... rest_of_launch_args)
+// Launch API which takes team resource struct and supports new reducers
+template <typename POLICY_LIST, typename... ReduceParams>
+resources::EventProxy<resources::Resource> launch(
+    RAJA::resources::Resource res,
+    LaunchParams const &launch_params,
+    const char *kernel_name,
+    ReduceParams &&...rest_of_launch_args)
 {
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto &&launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
   ExecPlace place;
-  if(res.get_platform() == RAJA::Platform::host) {
+  if (res.get_platform() == RAJA::Platform::host) {
     place = RAJA::ExecPlace::HOST;
   } else {
     place = RAJA::ExecPlace::DEVICE;
   }
 
   //
-  //Configure plugins
+  // Configure plugins
   //
 #if defined(RAJA_GPU_ACTIVE)
-  util::PluginContext context{place == ExecPlace::HOST ?
-      util::make_context<typename POLICY_LIST::host_policy_t>() :
-      util::make_context<typename POLICY_LIST::device_policy_t>()};
+  util::PluginContext context{
+      place == ExecPlace::HOST
+          ? util::make_context<typename POLICY_LIST::host_policy_t>()
+          : util::make_context<typename POLICY_LIST::device_policy_t>()};
 #else
-  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+  util::PluginContext context{
+      util::make_context<typename POLICY_LIST::host_policy_t>()};
 #endif
 
   util::callPreCapturePlugins(context);
@@ -432,14 +494,16 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
   switch (place) {
     case ExecPlace::HOST: {
       using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+      resources::EventProxy<resources::Resource> e_proxy =
+          launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }
 #if defined(RAJA_GPU_ACTIVE)
     case ExecPlace::DEVICE: {
       using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name,  p_body, reducers);
+      resources::EventProxy<resources::Resource> e_proxy =
+          launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }
@@ -456,36 +520,42 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
 }
 
 
-//Duplicate of API above on account that we need to handle the case that a kernel name is not provided
-template <typename POLICY_LIST, typename ... ReduceParams>
-resources::EventProxy<resources::Resource>
-launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       ReduceParams&&... rest_of_launch_args)
+// Duplicate of API above on account that we need to handle the case that a
+// kernel name is not provided
+template <typename POLICY_LIST, typename... ReduceParams>
+resources::EventProxy<resources::Resource> launch(
+    RAJA::resources::Resource res,
+    LaunchParams const &launch_params,
+    ReduceParams &&...rest_of_launch_args)
 {
 
   const char *kernel_name = nullptr;
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto &&launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
   ExecPlace place;
-  if(res.get_platform() == RAJA::Platform::host) {
+  if (res.get_platform() == RAJA::Platform::host) {
     place = RAJA::ExecPlace::HOST;
   } else {
     place = RAJA::ExecPlace::DEVICE;
   }
 
   //
-  //Configure plugins
+  // Configure plugins
   //
 #if defined(RAJA_GPU_ACTIVE)
-  util::PluginContext context{place == ExecPlace::HOST ?
-      util::make_context<typename POLICY_LIST::host_policy_t>() :
-      util::make_context<typename POLICY_LIST::device_policy_t>()};
+  util::PluginContext context{
+      place == ExecPlace::HOST
+          ? util::make_context<typename POLICY_LIST::host_policy_t>()
+          : util::make_context<typename POLICY_LIST::device_policy_t>()};
 #else
-  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+  util::PluginContext context{
+      util::make_context<typename POLICY_LIST::host_policy_t>()};
 #endif
 
   util::callPreCapturePlugins(context);
@@ -500,14 +570,16 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
   switch (place) {
     case ExecPlace::HOST: {
       using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+      resources::EventProxy<resources::Resource> e_proxy =
+          launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }
 #if defined(RAJA_GPU_ACTIVE)
     case ExecPlace::DEVICE: {
       using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+      resources::EventProxy<resources::Resource> e_proxy =
+          launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }
@@ -523,7 +595,7 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
   return resources::EventProxy<resources::Resource>(res);
 }
 
-template<typename POLICY_LIST>
+template <typename POLICY_LIST>
 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
 using loop_policy = typename POLICY_LIST::device_policy_t;
 #else
@@ -546,8 +618,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
                                        BODY const &body)
 {
 
-  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      segment, body);
+  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment, body);
 }
 
 template <typename POLICY_LIST,
@@ -555,12 +626,13 @@ template <typename POLICY_LIST,
           typename SEGMENT,
           typename BODY>
 RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
-                                          SEGMENT const &segment,
-                                          BODY const &body)
+                                              SEGMENT const &segment,
+                                              BODY const &body)
 {
 
   LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      segment, body);
+                                                             segment,
+                                                             body);
 }
 
 namespace expt
@@ -578,7 +650,9 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
 {
 
   LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      segment0, segment1, body);
+                                                       segment0,
+                                                       segment1,
+                                                       body);
 }
 
 RAJA_SUPPRESS_HD_WARN
@@ -587,13 +661,15 @@ template <typename POLICY_LIST,
           typename SEGMENT,
           typename BODY>
 RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       BODY const &body)
+                                              SEGMENT const &segment0,
+                                              SEGMENT const &segment1,
+                                              BODY const &body)
 {
 
   LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      segment0, segment1, body);
+                                                             segment0,
+                                                             segment1,
+                                                             body);
 }
 
 RAJA_SUPPRESS_HD_WARN
@@ -608,11 +684,8 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
                                        BODY const &body)
 {
 
-  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       segment0,
-                                                       segment1,
-                                                       segment2,
-                                                       body);
+  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, segment0, segment1, segment2, body);
 }
 
 RAJA_SUPPRESS_HD_WARN
@@ -621,17 +694,17 @@ template <typename POLICY_LIST,
           typename SEGMENT,
           typename BODY>
 RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       SEGMENT const &segment2,
-                                       BODY const &body)
+                                              SEGMENT const &segment0,
+                                              SEGMENT const &segment1,
+                                              SEGMENT const &segment2,
+                                              BODY const &body)
 {
 
-  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      segment0, segment1, segment2, body);
+  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, segment0, segment1, segment2, body);
 }
 
-} //namespace expt
+}  // namespace expt
 
 template <typename POLICY, typename SEGMENT>
 struct TileExecute;
@@ -651,7 +724,9 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
 {
 
   TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      tile_size, segment, body);
+                                                       tile_size,
+                                                       segment,
+                                                       body);
 }
 
 template <typename POLICY_LIST,
@@ -660,12 +735,14 @@ template <typename POLICY_LIST,
           typename SEGMENT,
           typename BODY>
 RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
-                                       TILE_T tile_size,
-                                       SEGMENT const &segment,
-                                       BODY const &body)
+                                              TILE_T tile_size,
+                                              SEGMENT const &segment,
+                                              BODY const &body)
 {
   TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      tile_size, segment, body);
+                                                             tile_size,
+                                                             segment,
+                                                             body);
 }
 
 namespace expt
@@ -684,8 +761,8 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
                                        BODY const &body)
 {
 
-  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      tile_size0, tile_size1, segment0, segment1, body);
+  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, tile_size0, tile_size1, segment0, segment1, body);
 }
 
 template <typename POLICY_LIST,
@@ -694,15 +771,15 @@ template <typename POLICY_LIST,
           typename SEGMENT,
           typename BODY>
 RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
-                                       TILE_T tile_size0,
-                                       TILE_T tile_size1,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       BODY const &body)
+                                              TILE_T tile_size0,
+                                              TILE_T tile_size1,
+                                              SEGMENT const &segment0,
+                                              SEGMENT const &segment1,
+                                              BODY const &body)
 {
 
-  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      tile_size0, tile_size1, segment0, segment1, body);
+  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, tile_size0, tile_size1, segment0, segment1, body);
 }
 
 template <typename POLICY_LIST,
@@ -721,8 +798,13 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
 {
 
   TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      tile_size0, tile_size1, tile_size2,
-      segment0, segment1, segment2, body);
+                                                       tile_size0,
+                                                       tile_size1,
+                                                       tile_size2,
+                                                       segment0,
+                                                       segment1,
+                                                       segment2,
+                                                       body);
 }
 
 template <typename POLICY_LIST,
@@ -731,21 +813,26 @@ template <typename POLICY_LIST,
           typename SEGMENT,
           typename BODY>
 RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
-                                       TILE_T tile_size0,
-                                       TILE_T tile_size1,
-                                       TILE_T tile_size2,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       SEGMENT const &segment2,
-                                       BODY const &body)
+                                              TILE_T tile_size0,
+                                              TILE_T tile_size1,
+                                              TILE_T tile_size2,
+                                              SEGMENT const &segment0,
+                                              SEGMENT const &segment1,
+                                              SEGMENT const &segment2,
+                                              BODY const &body)
 {
 
   TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-      tile_size0, tile_size1, tile_size2,
-      segment0, segment1, segment2, body);
+                                                             tile_size0,
+                                                             tile_size1,
+                                                             tile_size2,
+                                                             segment0,
+                                                             segment1,
+                                                             segment2,
+                                                             body);
 }
 
-} //namespace expt
+}  // namespace expt
 
 }  // namespace RAJA
 #endif
diff --git a/include/RAJA/pattern/multi_reduce.hpp b/include/RAJA/pattern/multi_reduce.hpp
index 3fbe36877c..65e32c9656 100644
--- a/include/RAJA/pattern/multi_reduce.hpp
+++ b/include/RAJA/pattern/multi_reduce.hpp
@@ -19,7 +19,6 @@
 #define RAJA_multi_reduce_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/macros.hpp"
 
@@ -156,7 +155,7 @@ struct MultiReduceSum;
  */
 template <typename MULTI_REDUCE_POLICY_T, typename T>
 struct MultiReduceBitOr;
- 
+
 
 /*!
  ******************************************************************************
@@ -171,7 +170,8 @@ struct MultiReduceBitOr;
    Index_ptr bins = ...;
    Real_ptr bit_vals = ...;
 
-   MultiReduceBitAnd<multi_reduce_policy, Real_type> my_bits(num_bins, init_val);
+   MultiReduceBitAnd<multi_reduce_policy, Real_type> my_bits(num_bins,
+                                                             init_val);
 
    forall<exec_policy>( ..., [=] (Index_type i) {
       my_bits[bins[i]] &= (data[i]);
@@ -188,7 +188,7 @@ struct MultiReduceBitOr;
 template <typename MULTI_REDUCE_POLICY_T, typename T>
 struct MultiReduceBitAnd;
 
-} //namespace RAJA
+}  // namespace RAJA
 
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp
index 5a656206f5..eefcdf9652 100644
--- a/include/RAJA/pattern/params/forall.hpp
+++ b/include/RAJA/pattern/params/forall.hpp
@@ -1,19 +1,18 @@
 #ifndef FORALL_PARAM_HPP
 #define FORALL_PARAM_HPP
 
-#include "RAJA/policy/sequential/params/reduce.hpp"
-#include "RAJA/policy/sequential/params/kernel_name.hpp"
-#include "RAJA/policy/openmp/params/reduce.hpp"
-#include "RAJA/policy/openmp/params/kernel_name.hpp"
-#include "RAJA/policy/openmp_target/params/reduce.hpp"
-#include "RAJA/policy/openmp_target/params/kernel_name.hpp"
-#include "RAJA/policy/cuda/params/reduce.hpp"
 #include "RAJA/policy/cuda/params/kernel_name.hpp"
-#include "RAJA/policy/hip/params/reduce.hpp"
+#include "RAJA/policy/cuda/params/reduce.hpp"
 #include "RAJA/policy/hip/params/kernel_name.hpp"
-#include "RAJA/policy/sycl/params/reduce.hpp"
+#include "RAJA/policy/hip/params/reduce.hpp"
+#include "RAJA/policy/openmp/params/kernel_name.hpp"
+#include "RAJA/policy/openmp/params/reduce.hpp"
+#include "RAJA/policy/openmp_target/params/kernel_name.hpp"
+#include "RAJA/policy/openmp_target/params/reduce.hpp"
+#include "RAJA/policy/sequential/params/kernel_name.hpp"
+#include "RAJA/policy/sequential/params/reduce.hpp"
 #include "RAJA/policy/sycl/params/kernel_name.hpp"
-
+#include "RAJA/policy/sycl/params/reduce.hpp"
 #include "RAJA/util/CombiningAdapter.hpp"
 
 namespace RAJA
@@ -21,348 +20,443 @@ namespace RAJA
 namespace expt
 {
 
-  //
-  //
-  // Forall Parameter Packing type
-  //
-  //
-  struct ParamMultiplexer;
-
-  template<typename... Params>
-  struct ForallParamPack {
-
-    friend struct ParamMultiplexer;
-
-    using Base = camp::tuple<Params...>;
-    Base param_tup;
-
-    static constexpr size_t param_tup_sz = camp::tuple_size<Base>::value; 
-    using params_seq = camp::make_idx_seq_t< param_tup_sz >;
-
-  private:
-
-    // Init
-    template<typename EXEC_POL, camp::idx_t... Seq, typename ...Args>
-    static constexpr void detail_init(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params, Args&& ...args) {
-      CAMP_EXPAND(expt::detail::init<EXEC_POL>( camp::get<Seq>(f_params.param_tup), std::forward<Args>(args)... ));
-    }
-
-    // Combine
-    template<typename EXEC_POL, camp::idx_t... Seq>
-    RAJA_HOST_DEVICE
-    static constexpr void detail_combine(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& out, const ForallParamPack& in ) {
-      CAMP_EXPAND(detail::combine<EXEC_POL>( camp::get<Seq>(out.param_tup), camp::get<Seq>(in.param_tup)));
-    }
-
-    template<typename EXEC_POL, camp::idx_t... Seq>
-    RAJA_HOST_DEVICE
-    static constexpr void detail_combine(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params ) {
-      CAMP_EXPAND(detail::combine<EXEC_POL>( camp::get<Seq>(f_params.param_tup) ));
-    }
-    
-    // Resolve
-    template<typename EXEC_POL, camp::idx_t... Seq, typename ...Args>
-    static constexpr void detail_resolve(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params, Args&& ...args) {
-      CAMP_EXPAND(detail::resolve<EXEC_POL>( camp::get<Seq>(f_params.param_tup), std::forward<Args>(args)... ));
-    }
-
-    // Used to construct the argument TYPES that will be invoked with the lambda.
-    template<typename null_t = camp::nil>
-    static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple<>{}; };
-    template<typename null_t = camp::nil, typename First>
-    static constexpr auto LAMBDA_ARG_TUP_T() { return typename First::ARG_TUP_T(); };
-    template<typename null_t = camp::nil, typename First, typename Second, typename... Rest>
-    static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple_cat_pair(typename First::ARG_TUP_T(), LAMBDA_ARG_TUP_T<camp::nil, Second, Rest...>()); };
-
-    using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T<camp::nil, Params...>());
-    
-    //Use the size of param_tup to generate the argument list.
-    RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) { return camp::make_tuple(); }
-    RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) { return camp::get<param_tup_sz - 1>(param_tup).get_lambda_arg_tup(); }
-    template<camp::idx_t N>
-    RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<N>) {
-      return camp::tuple_cat_pair(  camp::get<param_tup_sz - N>(param_tup).get_lambda_arg_tup(), LAMBDA_ARG_TUP_V(camp::num<N-1>())  );
-    }
-
-  public:
-    ForallParamPack(){}
-
-    RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args() {return LAMBDA_ARG_TUP_V(camp::num<sizeof...(Params)>());}
-
-    using lambda_arg_seq = camp::make_idx_seq_t<camp::tuple_size<lambda_arg_tuple_t>::value>;
-
-    template<typename... Ts>
-    ForallParamPack(camp::tuple<Ts...>&& t) : param_tup(std::move(t)) {};
-  }; // struct ForallParamPack 
-  
-
-
-  //===========================================================================
-  //
-  //
-  // ParamMultiplexer is how we hook into the individual calls within forall_impl.
-  //
-  //
-  struct ParamMultiplexer {
-    template<typename EXEC_POL, typename... Params, typename ...Args, typename FP = ForallParamPack<Params...>>
-    static void constexpr init( ForallParamPack<Params...>& f_params, Args&& ...args) {
-      FP::detail_init(EXEC_POL(),typename FP::params_seq(), f_params, std::forward<Args>(args)... );
-    }
-    template<typename EXEC_POL, typename... Params, typename ...Args, typename FP = ForallParamPack<Params...>>
-    static void constexpr combine(ForallParamPack<Params...>& f_params, Args&& ...args){
-      FP::detail_combine(EXEC_POL(), typename FP::params_seq(), f_params, std::forward<Args>(args)... );
-    }
-    template<typename EXEC_POL, typename... Params, typename ...Args, typename FP = ForallParamPack<Params...>>
-    static void constexpr resolve( ForallParamPack<Params...>& f_params, Args&& ...args){
-      FP::detail_resolve(EXEC_POL(), typename FP::params_seq(), f_params, std::forward<Args>(args)... );
-    }
-  };
-  //===========================================================================
+//
+//
+// Forall Parameter Packing type
+//
+//
+struct ParamMultiplexer;
 
+template <typename... Params>
+struct ForallParamPack {
 
+  friend struct ParamMultiplexer;
 
-  //===========================================================================
-  //
-  //
-  // ForallParamPack generators.
-  //
-  //
-  RAJA_INLINE static auto get_empty_forall_param_pack(){
-    static ForallParamPack<> p;
-    return p;
-  }
+  using Base = camp::tuple<Params...>;
+  Base param_tup;
 
-  namespace detail {
-    // all_true trick to perform variadic expansion in static asserts.
-    // https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template
-    template<bool...> struct bool_pack;
-    template<bool... bs>
-    using all_true = std::is_same<bool_pack<bs..., true>, bool_pack<true, bs...>>;
+  static constexpr size_t param_tup_sz = camp::tuple_size<Base>::value;
+  using params_seq = camp::make_idx_seq_t<param_tup_sz>;
 
-    template<typename Base, typename... Ts>
-    using check_types_derive_base = all_true<std::is_convertible<Ts, Base>::value...>;
-  } // namespace detail
+private:
+  // Init
+  template <typename EXEC_POL, camp::idx_t... Seq, typename... Args>
+  static constexpr void detail_init(EXEC_POL,
+                                    camp::idx_seq<Seq...>,
+                                    ForallParamPack& f_params,
+                                    Args&&... args)
+  {
+    CAMP_EXPAND(expt::detail::init<EXEC_POL>(camp::get<Seq>(f_params.param_tup),
+                                             std::forward<Args>(args)...));
+  }
 
+  // Combine
+  template <typename EXEC_POL, camp::idx_t... Seq>
+  RAJA_HOST_DEVICE static constexpr void detail_combine(
+      EXEC_POL,
+      camp::idx_seq<Seq...>,
+      ForallParamPack& out,
+      const ForallParamPack& in)
+  {
+    CAMP_EXPAND(detail::combine<EXEC_POL>(camp::get<Seq>(out.param_tup),
+                                          camp::get<Seq>(in.param_tup)));
+  }
+
+  template <typename EXEC_POL, camp::idx_t... Seq>
+  RAJA_HOST_DEVICE static constexpr void detail_combine(
+      EXEC_POL,
+      camp::idx_seq<Seq...>,
+      ForallParamPack& f_params)
+  {
+    CAMP_EXPAND(detail::combine<EXEC_POL>(camp::get<Seq>(f_params.param_tup)));
+  }
 
-  template<typename... Ts>
-  constexpr auto make_forall_param_pack_from_tuple(camp::tuple<Ts...>&& tuple) {
-    static_assert(detail::check_types_derive_base<detail::ForallParamBase, camp::decay<Ts>...>::value,
-        "Forall optional arguments do not derive ForallParamBase. Please see Reducer, ReducerLoc and KernelName for examples.") ;
-    return ForallParamPack<camp::decay<Ts>...>(std::move(tuple));
+  // Resolve
+  template <typename EXEC_POL, camp::idx_t... Seq, typename... Args>
+  static constexpr void detail_resolve(EXEC_POL,
+                                       camp::idx_seq<Seq...>,
+                                       ForallParamPack& f_params,
+                                       Args&&... args)
+  {
+    CAMP_EXPAND(detail::resolve<EXEC_POL>(camp::get<Seq>(f_params.param_tup),
+                                          std::forward<Args>(args)...));
   }
 
-  
+  // Used to construct the argument TYPES that will be invoked with the lambda.
+  template <typename null_t = camp::nil>
+  static constexpr auto LAMBDA_ARG_TUP_T()
+  {
+    return camp::tuple<>{};
+  };
+  template <typename null_t = camp::nil, typename First>
+  static constexpr auto LAMBDA_ARG_TUP_T()
+  {
+    return typename First::ARG_TUP_T();
+  };
+  template <typename null_t = camp::nil,
+            typename First,
+            typename Second,
+            typename... Rest>
+  static constexpr auto LAMBDA_ARG_TUP_T()
+  {
+    return camp::tuple_cat_pair(typename First::ARG_TUP_T(),
+                                LAMBDA_ARG_TUP_T<camp::nil, Second, Rest...>());
+  };
 
-  namespace detail {
-    // Maybe we should do a lot of these with structs...
-    template<camp::idx_t... Seq, typename TupleType>
-    constexpr auto tuple_from_seq (const camp::idx_seq<Seq...>&, TupleType&& tuple){
-      return camp::forward_as_tuple( camp::get< Seq >(std::forward<TupleType>(tuple))... );
-    };
+  using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T<camp::nil, Params...>());
 
-    template<typename... Ts>
-    constexpr auto strip_last_elem(camp::tuple<Ts...>&& tuple){
-      return tuple_from_seq(camp::make_idx_seq_t<sizeof...(Ts)-1>{},std::move(tuple));
-    };
-  } // namespace detail
+  // Use the size of param_tup to generate the argument list.
+  RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>)
+  {
+    return camp::make_tuple();
+  }
+  RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>)
+  {
+    return camp::get<param_tup_sz - 1>(param_tup).get_lambda_arg_tup();
+  }
+  template <camp::idx_t N>
+  RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<N>)
+  {
+    return camp::tuple_cat_pair(
+        camp::get<param_tup_sz - N>(param_tup).get_lambda_arg_tup(),
+        LAMBDA_ARG_TUP_V(camp::num<N - 1>()));
+  }
 
+public:
+  ForallParamPack() {}
 
-  // Make a tuple of the param pack except the final element...
-  template<typename... Args>
-  constexpr auto make_forall_param_pack(Args&&... args){
-    // We assume the last element of the pack is the lambda so we need to strip it from the list.
-    auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward<Args>(args)...) ); 
-    return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple));
+  RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args()
+  {
+    return LAMBDA_ARG_TUP_V(camp::num<sizeof...(Params)>());
   }
-  //===========================================================================
-
-
-
-  //===========================================================================
-  //
-  //
-  // Callable should be the last argument in the param pack, just extract it...
-  //
-  //
-  template<typename... Args>
-  constexpr auto&& get_lambda(Args&&... args){
-    return camp::get<sizeof...(Args)-1>( camp::forward_as_tuple(std::forward<Args>(args)...) );
-  } 
-  //===========================================================================
-
-
-
-  //===========================================================================
-  //
-  //
-  // Checking expected argument list against the assumed lambda.
-  //
-  //
-  namespace detail {
-
-    // 
-    //
-    // Lambda traits Utilities
-    // 
-    //
-    template<class F>
-    struct lambda_traits;
-
-    template<class R, class C, class First, class... Rest>
-    struct lambda_traits<R (C::*)(First, Rest...)>
-    {  // non-const specialization
-      using arg_type = First; 
-    };
-    template<class R, class C, class First, class... Rest>
-    struct lambda_traits<R (C::*)(First, Rest...) const>
-    {  // const specialization
-      using arg_type = First; 
-    };
-
-    template<class T>
-    typename lambda_traits<T>::arg_type* lambda_arg_helper(T);
-
-
-    // 
-    //
-    // List manipulation Utilities
-    // 
-    //
-    template<typename... Ts>
-    constexpr auto list_remove_pointer(const camp::list<Ts...>&){
-      return camp::list<camp::decay<typename std::remove_pointer<Ts>::type>...>{};
-    }
-    
-    template<typename... Ts>
-    constexpr auto list_add_lvalue_ref(const camp::list<Ts...>&){
-      return camp::list<typename std::add_lvalue_reference<Ts>::type...>{};
-    }
-
-    template<typename... Ts>
-    constexpr auto tuple_to_list(const camp::tuple<Ts...>&) {
-      return camp::list<Ts...>{};
-    }
-
-    // TODO : Change to std::is_invocable at c++17
-    template <typename F, typename... Args>
-    struct is_invocable :
-      std::is_constructible<
-        std::function<void(Args ...)>,
-        std::reference_wrapper<typename std::remove_reference<F>::type>
-      >{};
-
-    template<class...>
-    using void_t = void;
-
-    template<class F, class=void>
-    struct has_empty_op : std::false_type{};
-
-    template<class F>
-    struct has_empty_op<F, void_t<decltype(std::declval<F::operator()>)>> : std::true_type{};
-
-    template<class F>
-    struct get_lambda_index_type {
-      typedef typename std::remove_pointer<
-                decltype(lambda_arg_helper(
-                      &camp::decay<F>::operator())
-                )
-              >::type type;
-    };
-
-    // If LAMBDA::operator() is not available this probably isn't a generic lambda and we can't extract and check args.
-    template<typename LAMBDA, typename... EXPECTED_ARGS>
-    constexpr concepts::enable_if<concepts::negate<has_empty_op<LAMBDA>>> check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&) {}
-
-    template<typename LAMBDA, typename... EXPECTED_ARGS>
-    constexpr concepts::enable_if<has_empty_op<LAMBDA>> check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&) {
-#if !defined(RAJA_ENABLE_HIP)
-      static_assert(is_invocable<LAMBDA, typename get_lambda_index_type<LAMBDA>::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types must match between RAJA::expt::Reduce() and ValOp arguments."); 
-#endif
-    }
 
-  } // namespace detail
+  using lambda_arg_seq =
+      camp::make_idx_seq_t<camp::tuple_size<lambda_arg_tuple_t>::value>;
+
+  template <typename... Ts>
+  ForallParamPack(camp::tuple<Ts...>&& t) : param_tup(std::move(t)){};
+};  // struct ForallParamPack
+
+
+//===========================================================================
+//
+//
+// ParamMultiplexer is how we hook into the individual calls within forall_impl.
+//
+//
+struct ParamMultiplexer {
+  template <typename EXEC_POL,
+            typename... Params,
+            typename... Args,
+            typename FP = ForallParamPack<Params...>>
+  static void constexpr init(ForallParamPack<Params...>& f_params,
+                             Args&&... args)
+  {
+    FP::detail_init(EXEC_POL(),
+                    typename FP::params_seq(),
+                    f_params,
+                    std::forward<Args>(args)...);
+  }
+  template <typename EXEC_POL,
+            typename... Params,
+            typename... Args,
+            typename FP = ForallParamPack<Params...>>
+  static void constexpr combine(ForallParamPack<Params...>& f_params,
+                                Args&&... args)
+  {
+    FP::detail_combine(EXEC_POL(),
+                       typename FP::params_seq(),
+                       f_params,
+                       std::forward<Args>(args)...);
+  }
+  template <typename EXEC_POL,
+            typename... Params,
+            typename... Args,
+            typename FP = ForallParamPack<Params...>>
+  static void constexpr resolve(ForallParamPack<Params...>& f_params,
+                                Args&&... args)
+  {
+    FP::detail_resolve(EXEC_POL(),
+                       typename FP::params_seq(),
+                       f_params,
+                       std::forward<Args>(args)...);
+  }
+};
+//===========================================================================
+
 
+//===========================================================================
+//
+//
+// ForallParamPack generators.
+//
+//
+RAJA_INLINE static auto get_empty_forall_param_pack()
+{
+  static ForallParamPack<> p;
+  return p;
+}
 
-  template<typename Lambda, typename ForallParams>
-  constexpr 
-  void
-  check_forall_optional_args(Lambda&& l, ForallParams& fpp) {
+namespace detail
+{
+// all_true trick to perform variadic expansion in static asserts.
+// https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template
+template <bool...>
+struct bool_pack;
+template <bool... bs>
+using all_true = std::is_same<bool_pack<bs..., true>, bool_pack<true, bs...>>;
 
-    using expected_arg_type_list = decltype( detail::list_add_lvalue_ref(
-                                               detail::list_remove_pointer(
-                                                 detail::tuple_to_list(
-                                                   fpp.lambda_args()
-                                                 )
-                                               )
-                                            ));
+template <typename Base, typename... Ts>
+using check_types_derive_base =
+    all_true<std::is_convertible<Ts, Base>::value...>;
+}  // namespace detail
 
-    detail::check_invocable(std::forward<Lambda>(l), expected_arg_type_list{});
-  }
-  //===========================================================================
-  
 
+template <typename... Ts>
+constexpr auto make_forall_param_pack_from_tuple(camp::tuple<Ts...>&& tuple)
+{
+  static_assert(detail::check_types_derive_base<detail::ForallParamBase,
+                                                camp::decay<Ts>...>::value,
+                "Forall optional arguments do not derive ForallParamBase. "
+                "Please see Reducer, ReducerLoc and KernelName for examples.");
+  return ForallParamPack<camp::decay<Ts>...>(std::move(tuple));
+}
 
-  //===========================================================================
-  //
-  //
-  // Type trailts for SFINAE work.
-  //
-  //
-  namespace type_traits
-  {
-    template <typename T> struct is_ForallParamPack : std::false_type {};
-    template <typename... Args> struct is_ForallParamPack<ForallParamPack<Args...>> : std::true_type {};
 
-    template <typename T> struct is_ForallParamPack_empty : std::true_type {};
-    template <typename First, typename... Rest> struct is_ForallParamPack_empty<ForallParamPack<First, Rest...>> : std::false_type {};
-    template <> struct is_ForallParamPack_empty<ForallParamPack<>> : std::true_type {};
-  }
-  //===========================================================================
-
-
-
-  //===========================================================================
-  //
-  //
-  // Invoke Forall with Params.
-  //
-  //
-  namespace detail {
-    template<camp::idx_t Idx, typename FP>
-    RAJA_HOST_DEVICE
-    constexpr
-    auto get_lambda_args(FP& fpp)
-        -> decltype(  *camp::get<Idx>( fpp.lambda_args() )  ) {
-      return (  *camp::get<Idx>( fpp.lambda_args() )  );
-    }
-
-    CAMP_SUPPRESS_HD_WARN
-    template <typename Fn,
-              camp::idx_t... Sequence,
-              typename Params,
-              typename... Ts>
-    RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params,
-                                                      Fn&& f,
-                                                      camp::idx_seq<Sequence...>,
-                                                      Ts&&... extra)
-    {
-      return f(std::forward<Ts...>(extra...), ( get_lambda_args<Sequence>(params) )...);
-    }
-  } // namespace detail
-
-  //CAMP_SUPPRESS_HD_WARN
-  template <typename Params, typename Fn, typename... Ts>
-  RAJA_HOST_DEVICE constexpr auto invoke_body(Params&& params, Fn&& f, Ts&&... extra)
-  {
-    return detail::invoke_with_order(
-        camp::forward<Params>(params),
-        camp::forward<Fn>(f),
-        typename camp::decay<Params>::lambda_arg_seq(),
-        camp::forward<Ts...>(extra)...);
-  }
-  //===========================================================================
+namespace detail
+{
+// Maybe we should do a lot of these with structs...
+template <camp::idx_t... Seq, typename TupleType>
+constexpr auto tuple_from_seq(const camp::idx_seq<Seq...>&, TupleType&& tuple)
+{
+  return camp::forward_as_tuple(
+      camp::get<Seq>(std::forward<TupleType>(tuple))...);
+};
+
+template <typename... Ts>
+constexpr auto strip_last_elem(camp::tuple<Ts...>&& tuple)
+{
+  return tuple_from_seq(camp::make_idx_seq_t<sizeof...(Ts) - 1>{},
+                        std::move(tuple));
+};
+}  // namespace detail
 
-} //  namespace expt
-} //  namespace RAJA
 
-#endif //  FORALL_PARAM_HPP
+// Make a tuple of the param pack except the final element...
+template <typename... Args>
+constexpr auto make_forall_param_pack(Args&&... args)
+{
+  // We assume the last element of the pack is the lambda so we need to strip it
+  // from the list.
+  auto stripped_arg_tuple = detail::strip_last_elem(
+      camp::forward_as_tuple(std::forward<Args>(args)...));
+  return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple));
+}
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Callable should be the last argument in the param pack, just extract it...
+//
+//
+template <typename... Args>
+constexpr auto&& get_lambda(Args&&... args)
+{
+  return camp::get<sizeof...(Args) - 1>(
+      camp::forward_as_tuple(std::forward<Args>(args)...));
+}
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Checking expected argument list against the assumed lambda.
+//
+//
+namespace detail
+{
+
+//
+//
+// Lambda traits Utilities
+//
+//
+template <class F>
+struct lambda_traits;
+
+template <class R, class C, class First, class... Rest>
+struct lambda_traits<R (C::*)(First, Rest...)> {  // non-const specialization
+  using arg_type = First;
+};
+template <class R, class C, class First, class... Rest>
+struct lambda_traits<R (C::*)(First, Rest...) const> {  // const specialization
+  using arg_type = First;
+};
+
+template <class T>
+typename lambda_traits<T>::arg_type* lambda_arg_helper(T);
+
+
+//
+//
+// List manipulation Utilities
+//
+//
+template <typename... Ts>
+constexpr auto list_remove_pointer(const camp::list<Ts...>&)
+{
+  return camp::list<camp::decay<typename std::remove_pointer<Ts>::type>...>{};
+}
+
+template <typename... Ts>
+constexpr auto list_add_lvalue_ref(const camp::list<Ts...>&)
+{
+  return camp::list<typename std::add_lvalue_reference<Ts>::type...>{};
+}
+
+template <typename... Ts>
+constexpr auto tuple_to_list(const camp::tuple<Ts...>&)
+{
+  return camp::list<Ts...>{};
+}
+
+// TODO : Change to std::is_invocable at c++17
+template <typename F, typename... Args>
+struct is_invocable
+    : std::is_constructible<
+          std::function<void(Args...)>,
+          std::reference_wrapper<typename std::remove_reference<F>::type>> {
+};
+
+template <class...>
+using void_t = void;
+
+template <class F, class = void>
+struct has_empty_op : std::false_type {
+};
+
+template <class F>
+struct has_empty_op<F, void_t<decltype(std::declval<F::operator()>)>>
+    : std::true_type {
+};
+
+template <class F>
+struct get_lambda_index_type {
+  typedef typename std::remove_pointer<decltype(lambda_arg_helper(
+      &camp::decay<F>::operator()))>::type type;
+};
+
+// If LAMBDA::operator() is not available this probably isn't a generic lambda
+// and we can't extract and check args.
+template <typename LAMBDA, typename... EXPECTED_ARGS>
+constexpr concepts::enable_if<concepts::negate<has_empty_op<LAMBDA>>>
+check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&)
+{
+}
+
+template <typename LAMBDA, typename... EXPECTED_ARGS>
+constexpr concepts::enable_if<has_empty_op<LAMBDA>> check_invocable(
+    LAMBDA&&,
+    const camp::list<EXPECTED_ARGS...>&)
+{
+#if !defined(RAJA_ENABLE_HIP)
+  static_assert(is_invocable<LAMBDA,
+                             typename get_lambda_index_type<LAMBDA>::type,
+                             EXPECTED_ARGS...>::value,
+                "LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types "
+                "must match between RAJA::expt::Reduce() and ValOp arguments.");
+#endif
+}
+
+}  // namespace detail
+
+
+template <typename Lambda, typename ForallParams>
+constexpr void check_forall_optional_args(Lambda&& l, ForallParams& fpp)
+{
+
+  using expected_arg_type_list = decltype(detail::list_add_lvalue_ref(
+      detail::list_remove_pointer(detail::tuple_to_list(fpp.lambda_args()))));
+
+  detail::check_invocable(std::forward<Lambda>(l), expected_arg_type_list{});
+}
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Type traits for SFINAE work.
+//
+//
+namespace type_traits
+{
+template <typename T>
+struct is_ForallParamPack : std::false_type {
+};
+template <typename... Args>
+struct is_ForallParamPack<ForallParamPack<Args...>> : std::true_type {
+};
+
+template <typename T>
+struct is_ForallParamPack_empty : std::true_type {
+};
+template <typename First, typename... Rest>
+struct is_ForallParamPack_empty<ForallParamPack<First, Rest...>>
+    : std::false_type {
+};
+template <>
+struct is_ForallParamPack_empty<ForallParamPack<>> : std::true_type {
+};
+}  // namespace type_traits
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Invoke Forall with Params.
+//
+//
+namespace detail
+{
+template <camp::idx_t Idx, typename FP>
+RAJA_HOST_DEVICE constexpr auto get_lambda_args(FP& fpp)
+    -> decltype(*camp::get<Idx>(fpp.lambda_args()))
+{
+  return (*camp::get<Idx>(fpp.lambda_args()));
+}
+
+CAMP_SUPPRESS_HD_WARN
+template <typename Fn, camp::idx_t... Sequence, typename Params, typename... Ts>
+RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params,
+                                                  Fn&& f,
+                                                  camp::idx_seq<Sequence...>,
+                                                  Ts&&... extra)
+{
+  return f(std::forward<Ts...>(extra...),
+           (get_lambda_args<Sequence>(params))...);
+}
+}  // namespace detail
+
+// CAMP_SUPPRESS_HD_WARN
+template <typename Params, typename Fn, typename... Ts>
+RAJA_HOST_DEVICE constexpr auto invoke_body(Params&& params,
+                                            Fn&& f,
+                                            Ts&&... extra)
+{
+  return detail::invoke_with_order(
+      camp::forward<Params>(params),
+      camp::forward<Fn>(f),
+      typename camp::decay<Params>::lambda_arg_seq(),
+      camp::forward<Ts...>(extra)...);
+}
+//===========================================================================
+
+}  //  namespace expt
+}  //  namespace RAJA
+
+#endif  //  FORALL_PARAM_HPP
diff --git a/include/RAJA/pattern/params/kernel_name.hpp b/include/RAJA/pattern/params/kernel_name.hpp
index e768d8dd59..5b69c89a71 100644
--- a/include/RAJA/pattern/params/kernel_name.hpp
+++ b/include/RAJA/pattern/params/kernel_name.hpp
@@ -10,23 +10,19 @@ namespace expt
 namespace detail
 {
 
-  struct KernelName : public ForallParamBase {
-    RAJA_HOST_DEVICE KernelName() {}
-    KernelName(const char* name_in) : name(name_in) {}
-    const char* name;
-  };
+struct KernelName : public ForallParamBase {
+  RAJA_HOST_DEVICE KernelName() {}  // name stays null
+  KernelName(const char* name_in) : name(name_in) {}
+  const char* name = nullptr;  // pointer is stored as given, not copied
+};
 
-} // namespace detail
-
-inline auto KernelName(const char * n)
-{
-  return detail::KernelName(n);
-}
-} // namespace expt
+}  // namespace detail
 
+inline auto KernelName(const char* n) { return detail::KernelName(n); }
+}  // namespace expt
 
-} //  namespace RAJA
 
+}  //  namespace RAJA
 
 
-#endif // KERNEL_NAME_HPP
+#endif  // KERNEL_NAME_HPP
diff --git a/include/RAJA/pattern/params/params_base.hpp b/include/RAJA/pattern/params/params_base.hpp
index 98380f6ffc..debfd66282 100644
--- a/include/RAJA/pattern/params/params_base.hpp
+++ b/include/RAJA/pattern/params/params_base.hpp
@@ -7,129 +7,253 @@ namespace RAJA
 namespace expt
 {
 
-  template<typename T, typename IndexType = RAJA::Index_type>
-  struct ValLoc {
-    using index_type = IndexType;
-    using value_type = T;
-
-    ValLoc() = default;
-    RAJA_HOST_DEVICE constexpr explicit ValLoc(value_type v) : val(v) {}
-    RAJA_HOST_DEVICE constexpr ValLoc(value_type v, index_type l) : val(v), loc(l) {}
-
-    ValLoc(ValLoc const &) = default;
-    ValLoc(ValLoc &&) = default;
-    ValLoc& operator=(ValLoc const &) = default;
-    ValLoc& operator=(ValLoc &&) = default;
-
-    RAJA_HOST_DEVICE constexpr bool operator<(const ValLoc& rhs) const { return val < rhs.val; }
-    RAJA_HOST_DEVICE constexpr bool operator>(const ValLoc& rhs) const { return val > rhs.val; }
-
-    RAJA_HOST_DEVICE constexpr const value_type& getVal() const {return val;}
-    RAJA_HOST_DEVICE constexpr const index_type& getLoc() const {return loc;}
-
-    RAJA_HOST_DEVICE void set(T inval, IndexType inindex) {val = inval; loc = inindex;}
-    RAJA_HOST_DEVICE void setVal(T inval) {val = inval;}
-    RAJA_HOST_DEVICE void setLoc(IndexType inindex) {loc = inindex;}
-
-    value_type val;
-    index_type loc = -1;
-  };
-
-  template<typename T, template <typename, typename, typename> class Op>
-  struct ValOp {
-    using value_type = T;
-    using op_type = Op<T,T,T>;
-
-    ValOp() = default;
-    RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
-
-    ValOp(ValOp const &) = default;
-    ValOp(ValOp &&) = default;
-    ValOp& operator=(ValOp const &) = default;
-    ValOp& operator=(ValOp &&) = default;
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & min(value_type v) { if (v < val) { val = v; } return *this; }
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & max(value_type v) { if (v > val) { val = v; } return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::plus<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & operator+=(const value_type& rhs) { val += rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_and<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & operator&=(const value_type& rhs) { val &= rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_or<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & operator|=(const value_type& rhs) { val |= rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_and<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE ValOp & operator&=(value_type& rhs) { val &= rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_or<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE ValOp & operator|=(value_type& rhs) { val |= rhs; return *this; }
-
-    RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const { val < rhs.val; return *this; }
-    RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const { val > rhs.val; return *this; }
-
-    value_type val = op_type::identity();
-  };
-
-  template<typename T, typename IndexType, template <typename, typename, typename> class Op>
-  struct ValOp <ValLoc<T,IndexType>, Op> {
-    using index_type = IndexType;
-    using value_type = ValLoc<T,index_type>;
-    using op_type = Op<value_type,value_type,value_type>;
-    using valloc_value_type = typename value_type::value_type;
-    using valloc_index_type = typename value_type::index_type;
-
-    ValOp() = default;
-    RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
-    RAJA_HOST_DEVICE constexpr ValOp(valloc_value_type v, valloc_index_type l) : val(v, l) {}
-
-    ValOp(ValOp const &) = default;
-    ValOp(ValOp &&) = default;
-    ValOp& operator=(ValOp const &) = default;
-    ValOp& operator=(ValOp &&) = default;
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & min(value_type v) { if (v < val) { val = v; } return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & max(value_type v) { if (v > val) { val = v; } return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & minloc(valloc_value_type v, valloc_index_type l) { return min(value_type(v,l)); }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & maxloc(valloc_value_type v, valloc_index_type l) { return max(value_type(v,l)); }
-
-    RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const { return val < rhs.val; }
-    RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const { return val > rhs.val; }
-
-    value_type val = op_type::identity();
-  };
-
-  template<typename T, typename IndexType, template <typename, typename, typename> class Op>
-  using ValLocOp = ValOp<ValLoc<T, IndexType>, Op>;
+template <typename T, typename IndexType = RAJA::Index_type>
+struct ValLoc {  // a value (val) stored together with an index (loc)
+  using index_type = IndexType;
+  using value_type = T;
+
+  ValLoc() = default;  // val has no default initializer; loc defaults to -1
+  RAJA_HOST_DEVICE constexpr explicit ValLoc(value_type v) : val(v) {}
+  RAJA_HOST_DEVICE constexpr ValLoc(value_type v, index_type l) : val(v), loc(l)
+  {
+  }
+
+  ValLoc(ValLoc const&) = default;
+  ValLoc(ValLoc&&) = default;
+  ValLoc& operator=(ValLoc const&) = default;
+  ValLoc& operator=(ValLoc&&) = default;
+
+  RAJA_HOST_DEVICE constexpr bool operator<(const ValLoc& rhs) const
+  {
+    return val < rhs.val;  // ordering looks at val only; loc is ignored
+  }
+  RAJA_HOST_DEVICE constexpr bool operator>(const ValLoc& rhs) const
+  {
+    return val > rhs.val;  // ordering looks at val only; loc is ignored
+  }
+
+  RAJA_HOST_DEVICE constexpr const value_type& getVal() const { return val; }
+  RAJA_HOST_DEVICE constexpr const index_type& getLoc() const { return loc; }
+
+  RAJA_HOST_DEVICE void set(T inval, IndexType inindex)
+  {
+    val = inval;
+    loc = inindex;
+  }
+  RAJA_HOST_DEVICE void setVal(T inval) { val = inval; }
+  RAJA_HOST_DEVICE void setLoc(IndexType inindex) { loc = inindex; }
+
+  value_type val;
+  index_type loc = -1;  // -1 unless a location is supplied
+};
+
+template <typename T, template <typename, typename, typename> class Op>
+struct ValOp {  // a value plus the operator type that selects its updaters
+  using value_type = T;
+  using op_type = Op<T, T, T>;
+
+  ValOp() = default;  // val starts at op_type::identity()
+  RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
+
+  ValOp(ValOp const&) = default;
+  ValOp(ValOp&&) = default;
+  ValOp& operator=(ValOp const&) = default;
+  ValOp& operator=(ValOp&&) = default;
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::minimum<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& min(value_type v)
+  {
+    if (v < val) {  // available only when Op is operators::minimum
+      val = v;
+    }
+    return *this;
+  }
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::maximum<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& max(value_type v)
+  {
+    if (v > val) {  // available only when Op is operators::maximum
+      val = v;
+    }
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::plus<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& operator+=(const value_type& rhs)
+  {
+    val += rhs;  // available only when Op is operators::plus
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_and<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& operator&=(const value_type& rhs)
+  {
+    val &= rhs;  // available only when Op is operators::bit_and
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_or<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& operator|=(const value_type& rhs)
+  {
+    val |= rhs;  // available only when Op is operators::bit_or
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_and<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE ValOp& operator&=(value_type& rhs)
+  {
+    val &= rhs;  // non-const lvalue overload; not constexpr
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_or<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE ValOp& operator|=(value_type& rhs)
+  {
+    val |= rhs;  // non-const lvalue overload; not constexpr
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const
+  {
+    // Return the comparison result; it was previously computed and dropped.
+    return val < rhs.val;
+  }
+  RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const
+  {
+    // Return the comparison result; it was previously computed and dropped.
+    return val > rhs.val;
+  }
+
+  value_type val = op_type::identity();
+};
+
+template <typename T,
+          typename IndexType,
+          template <typename, typename, typename>
+          class Op>
+struct ValOp<ValLoc<T, IndexType>, Op> {  // ValOp whose value is a ValLoc
+  using index_type = IndexType;
+  using value_type = ValLoc<T, index_type>;
+  using op_type = Op<value_type, value_type, value_type>;
+  using valloc_value_type = typename value_type::value_type;
+  using valloc_index_type = typename value_type::index_type;
+
+  ValOp() = default;  // val starts at op_type::identity()
+  RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
+  RAJA_HOST_DEVICE constexpr ValOp(valloc_value_type v, valloc_index_type l)
+      : val(v, l)  // builds the ValLoc from a value and a location
+  {
+  }
+
+  ValOp(ValOp const&) = default;
+  ValOp(ValOp&&) = default;
+  ValOp& operator=(ValOp const&) = default;
+  ValOp& operator=(ValOp&&) = default;
+
+  template <typename U = op_type,
+            std::enable_if_t<std::is_same<
+                U,
+                RAJA::operators::minimum<value_type, value_type, value_type>>::
+                                 value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& min(value_type v)
+  {
+    if (v < val) {  // available only when Op is operators::minimum
+      val = v;
+    }
+    return *this;
+  }
+
+  template <typename U = op_type,
+            std::enable_if_t<std::is_same<
+                U,
+                RAJA::operators::maximum<value_type, value_type, value_type>>::
+                                 value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& max(value_type v)
+  {
+    if (v > val) {  // available only when Op is operators::maximum
+      val = v;
+    }
+    return *this;
+  }
+
+  template <typename U = op_type,
+            std::enable_if_t<std::is_same<
+                U,
+                RAJA::operators::minimum<value_type, value_type, value_type>>::
+                                 value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& minloc(valloc_value_type v,
+                                           valloc_index_type l)
+  {
+    return min(value_type(v, l));  // pack into a ValLoc, then call min()
+  }
+
+  template <typename U = op_type,
+            std::enable_if_t<std::is_same<
+                U,
+                RAJA::operators::maximum<value_type, value_type, value_type>>::
+                                 value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& maxloc(valloc_value_type v,
+                                           valloc_index_type l)
+  {
+    return max(value_type(v, l));  // pack into a ValLoc, then call max()
+  }
+
+  RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const
+  {
+    return val < rhs.val;  // delegates to the wrapped value's operator<
+  }
+  RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const
+  {
+    return val > rhs.val;  // delegates to the wrapped value's operator>
+  }
+
+  value_type val = op_type::identity();
+};
+
+template <typename T,
+          typename IndexType,
+          template <typename, typename, typename>
+          class Op>
+using ValLocOp = ValOp<ValLoc<T, IndexType>, Op>;  // shorthand alias
 
 namespace detail
 {
 
-  struct ForallParamBase {
+struct ForallParamBase {
 
-    // Some of this can be made virtual in c++20, for now must be defined in each child class
-    // if any arguments to the forall lambda are needed (e.g. KernelName is excluded.)
-    using ARG_TUP_T = camp::tuple<>; 
-    using ARG_LIST_T = typename ARG_TUP_T::TList;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); }
-    static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
-  
-  };
+  // Some of this can be made virtual in c++20, for now must be defined in each
+  // child class if any arguments to the forall lambda are needed (e.g.
+  // KernelName is excluded.)
+  using ARG_TUP_T = camp::tuple<>;
+  using ARG_LIST_T = typename ARG_TUP_T::TList;
+  RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); }
+  static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
+};
 
-} // namespace detail
+}  // namespace detail
 
-} // namespace expt
+}  // namespace expt
 
-} //  namespace RAJA
+}  //  namespace RAJA
 
-#endif //  RAJA_PARAMS_BASE
+#endif  //  RAJA_PARAMS_BASE
diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp
index 78b6d7714d..89e020ec55 100644
--- a/include/RAJA/pattern/params/reducer.hpp
+++ b/include/RAJA/pattern/params/reducer.hpp
@@ -20,19 +20,21 @@ namespace operators
 
 template <typename T, typename IndexType>
 struct limits<RAJA::expt::ValLoc<T, IndexType>> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType> min()
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType>
+  min()
   {
     return RAJA::expt::ValLoc<T, IndexType>(RAJA::operators::limits<T>::min());
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType> max()
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType>
+  max()
   {
     return RAJA::expt::ValLoc<T, IndexType>(RAJA::operators::limits<T>::max());
   }
 };
 
-} //  namespace operators
+}  //  namespace operators
 
-} //  namespace RAJA
+}  //  namespace RAJA
 
 namespace RAJA
 {
@@ -43,159 +45,197 @@ namespace detail
 {
 
 #if defined(RAJA_CUDA_ACTIVE)
-  using device_mem_pool_t = RAJA::cuda::device_mempool_type;
+using device_mem_pool_t = RAJA::cuda::device_mempool_type;
 #elif defined(RAJA_HIP_ACTIVE)
-  using device_mem_pool_t = RAJA::hip::device_mempool_type;
+using device_mem_pool_t = RAJA::hip::device_mempool_type;
 #elif defined(RAJA_SYCL_ACTIVE)
-  using device_mem_pool_t = RAJA::sycl::device_mempool_type;
+using device_mem_pool_t = RAJA::sycl::device_mempool_type;
 #endif
 
-  //
-  //
-  // Basic Reducer
-  //
-  //
-
-  // Basic data type Reducer
-  // T must be a basic data type
-  // VOp must be ValOp<T, Op>
-  template <typename Op, typename T, typename VOp>
-  struct Reducer : public ForallParamBase {
-    using op = Op;
-    using value_type = T; // This is a basic data type
-
-    Reducer() = default;
-
-    // Basic data type constructor
-    RAJA_HOST_DEVICE Reducer(value_type *target_in) : m_valop(VOp{}), target(target_in){}
-
-    Reducer(Reducer const &) = default;
-    Reducer(Reducer &&) = default;
-    Reducer& operator=(Reducer const &) = default;
-    Reducer& operator=(Reducer &&) = default;
-
-    // Internal ValOp object that is used within RAJA::forall/launch
-    VOp m_valop = VOp{};
-
-    // Points to the user specified result variable
-    value_type *target = nullptr;
-
-    // combineTarget() performs the final op on the target data and location in resolve()
-    RAJA_HOST_DEVICE void combineTarget(value_type in)
-    {
-      value_type temp = op{}(*target, in);
-      *target = temp;
-    }
-
-    RAJA_HOST_DEVICE
-    value_type &
-    getVal() { return m_valop.val; }
-
-#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || defined(RAJA_SYCL_ACTIVE)
-    // Device related attributes.
-    value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
-    unsigned int * device_count = nullptr;
+//
+//
+// Basic Reducer
+//
+//
+
+// Basic data type Reducer
+// T must be a basic data type
+// VOp must be ValOp<T, Op>
+template <typename Op, typename T, typename VOp>
+struct Reducer : public ForallParamBase {
+  using op = Op;
+  using value_type = T;  // This is a basic data type
+
+  Reducer() = default;
+
+  // Basic data type constructor
+  RAJA_HOST_DEVICE Reducer(value_type *target_in)
+      : m_valop(VOp{}), target(target_in)
+  {
+  }
+
+  Reducer(Reducer const &) = default;
+  Reducer(Reducer &&) = default;
+  Reducer &operator=(Reducer const &) = default;
+  Reducer &operator=(Reducer &&) = default;
+
+  // Internal ValOp object that is used within RAJA::forall/launch
+  VOp m_valop = VOp{};
+
+  // Points to the user specified result variable
+  value_type *target = nullptr;
+
+  // combineTarget() performs the final op on the target data and location in
+  // resolve()
+  RAJA_HOST_DEVICE void combineTarget(value_type in)
+  {
+    value_type temp = op{}(*target, in);
+    *target = temp;
+  }
+
+  RAJA_HOST_DEVICE
+  value_type &getVal() { return m_valop.val; }
+
+#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || \
+    defined(RAJA_SYCL_ACTIVE)
+  // Device related attributes.
+  value_type *devicetarget = nullptr;
+  RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
+  unsigned int *device_count = nullptr;
 #endif
 
-    // These are types and parameters extracted from this struct, and given to the forall.
-    using ARG_TUP_T = camp::tuple<VOp*>;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&m_valop); }
-
-    using ARG_LIST_T = typename ARG_TUP_T::TList;
-    static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value ;
-  };
-
-  // Partial specialization of Reducer for ValLoc
-  // T is a deduced basic data type
-  // I is a deduced index type
-  template <typename T, typename I, template <typename, typename, typename> class Op>
-  struct Reducer<Op<ValLoc<T,I>, ValLoc<T,I>, ValLoc<T,I>>, ValLoc<T,I>, ValOp<ValLoc<T,I>, Op>> : public ForallParamBase {
-    using target_value_type = T;
-    using target_index_type = I;
-    using value_type = ValLoc<T,I>;
-    using op = Op<value_type,value_type,value_type>;
-    using VOp = ValOp<ValLoc<target_value_type,target_index_type>, Op>;
-
-    Reducer() = default;
-
-    // ValLoc constructor
-    // Note that the target_ variables point to the val and loc within the user defined target ValLoc
-    RAJA_HOST_DEVICE Reducer(value_type *target_in) : m_valop(VOp{}), target_value(&target_in->val), target_index(&target_in->loc) {}
-
-    // Dual input constructor for ReduceLoc<>(data, index) case
-    // The target_ variables point to vars defined by the user
-    RAJA_HOST_DEVICE Reducer(target_value_type *data_in, target_index_type *index_in) : m_valop(VOp{}), target_value(data_in), target_index(index_in) {}
-
-    Reducer(Reducer const &) = default;
-    Reducer(Reducer &&) = default;
-    Reducer& operator=(Reducer const &) = default;
-    Reducer& operator=(Reducer &&) = default;
-
-    // The ValLoc within m_valop is initialized with data and location values from either a ValLoc, or dual data and location values, passed into the constructor
-    VOp m_valop = VOp{};
-
-    // Points to either dual value and index defined by the user, or value and index within a ValLoc defined by the user
-    target_value_type *target_value = nullptr;
-    target_index_type *target_index = nullptr;
-
-    // combineTarget() performs the final op on the target data and location in resolve()
-    RAJA_HOST_DEVICE void combineTarget(value_type in)
-    {
-      // Create a different temp ValLoc solely for combining
-      value_type temp(*target_value, *target_index);
-      temp = op{}(temp, in);
-      *target_value = temp.val;
-      *target_index = temp.loc;
-    }
-
-    RAJA_HOST_DEVICE
-    value_type &
-    getVal() { return m_valop.val; }
-
-#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || defined(RAJA_SYCL_ACTIVE)
-    // Device related attributes.
-    value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
-    unsigned int * device_count = nullptr;
+  // These are types and parameters extracted from this struct, and given to the
+  // forall.
+  using ARG_TUP_T = camp::tuple<VOp *>;
+  RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup()
+  {
+    return camp::make_tuple(&m_valop);
+  }
+
+  using ARG_LIST_T = typename ARG_TUP_T::TList;
+  static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
+};
+
+// Partial specialization of Reducer for ValLoc
+// T is a deduced basic data type
+// I is a deduced index type
+template <typename T,
+          typename I,
+          template <typename, typename, typename>
+          class Op>
+struct Reducer<Op<ValLoc<T, I>, ValLoc<T, I>, ValLoc<T, I>>,
+               ValLoc<T, I>,
+               ValOp<ValLoc<T, I>, Op>> : public ForallParamBase {
+  using target_value_type = T;
+  using target_index_type = I;
+  using value_type = ValLoc<T, I>;
+  using op = Op<value_type, value_type, value_type>;
+  using VOp = ValOp<ValLoc<target_value_type, target_index_type>, Op>;
+
+  Reducer() = default;
+
+  // ValLoc constructor
+  // Note that the target_ variables point to the val and loc within the user
+  // defined target ValLoc
+  RAJA_HOST_DEVICE Reducer(value_type *target_in)
+      : m_valop(VOp{}),
+        target_value(&target_in->val),
+        target_index(&target_in->loc)
+  {
+  }
+
+  // Dual input constructor for ReduceLoc<>(data, index) case
+  // The target_ variables point to vars defined by the user
+  RAJA_HOST_DEVICE Reducer(target_value_type *data_in,
+                           target_index_type *index_in)
+      : m_valop(VOp{}), target_value(data_in), target_index(index_in)
+  {
+  }
+
+  Reducer(Reducer const &) = default;
+  Reducer(Reducer &&) = default;
+  Reducer &operator=(Reducer const &) = default;
+  Reducer &operator=(Reducer &&) = default;
+
+  // The ValLoc within m_valop is initialized with data and location values from
+  // either a ValLoc, or dual data and location values, passed into the
+  // constructor
+  VOp m_valop = VOp{};
+
+  // Points to either dual value and index defined by the user, or value and
+  // index within a ValLoc defined by the user
+  target_value_type *target_value = nullptr;
+  target_index_type *target_index = nullptr;
+
+  // combineTarget() performs the final op on the target data and location in
+  // resolve()
+  RAJA_HOST_DEVICE void combineTarget(value_type in)
+  {
+    // Create a different temp ValLoc solely for combining
+    value_type temp(*target_value, *target_index);
+    temp = op{}(temp, in);
+    *target_value = temp.val;
+    *target_index = temp.loc;
+  }
+
+  RAJA_HOST_DEVICE
+  value_type &getVal() { return m_valop.val; }
+
+#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || \
+    defined(RAJA_SYCL_ACTIVE)
+  // Device related attributes.
+  value_type *devicetarget = nullptr;
+  RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
+  unsigned int *device_count = nullptr;
 #endif
 
-    // These are types and parameters extracted from this struct, and given to the forall.
-    using ARG_TUP_T = camp::tuple<VOp*>;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&m_valop); }
+  // These are types and parameters extracted from this struct, and given to the
+  // forall.
+  using ARG_TUP_T = camp::tuple<VOp *>;
+  RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup()
+  {
+    return camp::make_tuple(&m_valop);
+  }
 
-    using ARG_LIST_T = typename ARG_TUP_T::TList;
-    static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value ;
-  };
+  using ARG_LIST_T = typename ARG_TUP_T::TList;
+  static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
+};
 
-} // namespace detail
+}  // namespace detail
 
 // Standard use case.
 template <template <typename, typename, typename> class Op, typename T>
 auto constexpr Reduce(T *target)
 {
-  return detail::Reducer<Op<T,T,T>, T, ValOp<T, Op>>(target);
+  return detail::Reducer<Op<T, T, T>, T, ValOp<T, Op>>(target);
 }
 
 // User-defined ValLoc case.
-template <template <typename, typename, typename> class Op, typename T, typename IndexType>
+template <template <typename, typename, typename> class Op,
+          typename T,
+          typename IndexType>
 auto constexpr Reduce(ValLoc<T, IndexType> *target)
 {
-  using VL = ValLoc<T,IndexType>;
-  return detail::Reducer<Op<VL,VL,VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(target);
+  using VL = ValLoc<T, IndexType>;
+  return detail::Reducer<Op<VL, VL, VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(
+      target);
 }
 
-// Dual input use case where reduction value and location are separate, non-ValLoc types supplied by the user.
-template <template <typename, typename, typename> class Op, typename T, typename IndexType>
+// Dual input use case where reduction value and location are separate,
+// non-ValLoc types supplied by the user.
+template <template <typename, typename, typename> class Op,
+          typename T,
+          typename IndexType>
 auto constexpr ReduceLoc(T *target, IndexType *index)
 {
-  using VL = ValLoc<T,IndexType>;
-  return detail::Reducer<Op<VL,VL,VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(target, index);
+  using VL = ValLoc<T, IndexType>;
+  return detail::Reducer<Op<VL, VL, VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(
+      target, index);
 }
 
-} // namespace expt
+}  // namespace expt
 
 
-} //  namespace RAJA
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_HPP
+#endif  //  NEW_REDUCE_HPP
diff --git a/include/RAJA/pattern/reduce.hpp b/include/RAJA/pattern/reduce.hpp
index 0c0eaf3efb..9cc781f02f 100644
--- a/include/RAJA/pattern/reduce.hpp
+++ b/include/RAJA/pattern/reduce.hpp
@@ -19,7 +19,6 @@
 #define RAJA_reduce_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/macros.hpp"
 
@@ -205,7 +204,7 @@ class ReduceSum;
  */
 template <typename REDUCE_POLICY_T, typename T>
 class ReduceBitOr;
- 
+
 
 /*!
  ******************************************************************************
@@ -231,7 +230,7 @@ class ReduceBitOr;
  */
 template <typename REDUCE_POLICY_T, typename T>
 class ReduceBitAnd;
-} //namespace RAJA
+}  // namespace RAJA
 
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/pattern/scan.hpp b/include/RAJA/pattern/scan.hpp
index 0f46ee0a22..bb737d9ea8 100644
--- a/include/RAJA/pattern/scan.hpp
+++ b/include/RAJA/pattern/scan.hpp
@@ -18,15 +18,14 @@
 #ifndef RAJA_scan_HPP
 #define RAJA_scan_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iterator>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
+#include "RAJA/pattern/detail/algorithm.hpp"
 #include "RAJA/policy/PolicyBase.hpp"
-#include "RAJA/util/concepts.hpp"
 #include "RAJA/util/Operators.hpp"
-#include "RAJA/pattern/detail/algorithm.hpp"
+#include "RAJA/util/concepts.hpp"
 
 namespace RAJA
 {
@@ -46,16 +45,17 @@ inline namespace policy_by_value_interface
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<Container>>
+template <
+    typename ExecPolicy,
+    typename Res,
+    typename Container,
+    typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_resource<Res>,
+    std::is_constructible<camp::resources::Resource, Res>,
+    type_traits::is_range<Container>>
 inclusive_scan_inplace(ExecPolicy&& p,
                        Res r,
                        Container&& c,
@@ -71,29 +71,28 @@ inclusive_scan_inplace(ExecPolicy&& p,
   if (begin(c) == end(c)) {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::inclusive_inplace(r, std::forward<ExecPolicy>(p),
-                                       begin(c), end(c), binop);
+  return impl::scan::inclusive_inplace(
+      r, std::forward<ExecPolicy>(p), begin(c), end(c), binop);
 }
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
+template <
+    typename ExecPolicy,
+    typename Container,
+    typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>,
+    typename Res = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
 inclusive_scan_inplace(ExecPolicy&& p,
                        Container&& c,
                        Function binop = Function{})
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::inclusive_scan_inplace(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      binop);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), binop);
 }
 
 /*!
@@ -113,12 +112,12 @@ template <typename ExecPolicy,
           typename Container,
           typename T = RAJA::detail::ContainerVal<Container>,
           typename Function = operators::plus<T>>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<Container>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_resource<Res>,
+    std::is_constructible<camp::resources::Resource, Res>,
+    type_traits::is_range<Container>>
 exclusive_scan_inplace(ExecPolicy&& p,
                        Res r,
                        Container&& c,
@@ -135,8 +134,8 @@ exclusive_scan_inplace(ExecPolicy&& p,
   if (begin(c) == end(c)) {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::exclusive_inplace(r, std::forward<ExecPolicy>(p),
-                                       begin(c), end(c), binop, value);
+  return impl::scan::exclusive_inplace(
+      r, std::forward<ExecPolicy>(p), begin(c), end(c), binop, value);
 }
 ///
 template <typename ExecPolicy,
@@ -144,11 +143,12 @@ template <typename ExecPolicy,
           typename T = RAJA::detail::ContainerVal<Container>,
           typename Function = operators::plus<T>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
 exclusive_scan_inplace(ExecPolicy&& p,
                        Container&& c,
                        Function binop = Function{},
@@ -156,11 +156,7 @@ exclusive_scan_inplace(ExecPolicy&& p,
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::exclusive_scan_inplace(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      binop,
-      value);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), binop, value);
 }
 
 /*!
@@ -183,14 +179,15 @@ template <typename ExecPolicy,
           typename Res,
           typename InContainer,
           typename OutContainer,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<InContainer>>>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<InContainer>,
-                      type_traits::is_range<OutContainer>>
+          typename Function =
+              operators::plus<RAJA::detail::ContainerVal<InContainer>>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_resource<Res>,
+    std::is_constructible<camp::resources::Resource, Res>,
+    type_traits::is_range<InContainer>,
+    type_traits::is_range<OutContainer>>
 inclusive_scan(ExecPolicy&& p,
                Res r,
                InContainer&& in,
@@ -210,21 +207,23 @@ inclusive_scan(ExecPolicy&& p,
   if (begin(in) == end(in)) {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::inclusive(r, std::forward<ExecPolicy>(p),
-                               begin(in), end(in), begin(out), binop);
+  return impl::scan::inclusive(
+      r, std::forward<ExecPolicy>(p), begin(in), end(in), begin(out), binop);
 }
 ///
 template <typename ExecPolicy,
           typename InContainer,
           typename OutContainer,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<InContainer>>,
+          typename Function =
+              operators::plus<RAJA::detail::ContainerVal<InContainer>>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<InContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, InContainer>>,
-                      type_traits::is_range<OutContainer>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<InContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, InContainer>>,
+    type_traits::is_range<OutContainer>>
 inclusive_scan(ExecPolicy&& p,
                InContainer&& in,
                OutContainer&& out,
@@ -261,13 +260,13 @@ template <typename ExecPolicy,
           typename OutContainer,
           typename T = RAJA::detail::ContainerVal<InContainer>,
           typename Function = operators::plus<T>>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<InContainer>,
-                      type_traits::is_range<OutContainer>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_resource<Res>,
+    std::is_constructible<camp::resources::Resource, Res>,
+    type_traits::is_range<InContainer>,
+    type_traits::is_range<OutContainer>>
 exclusive_scan(ExecPolicy&& p,
                Res r,
                InContainer&& in,
@@ -288,8 +287,13 @@ exclusive_scan(ExecPolicy&& p,
   if (begin(in) == end(in)) {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::exclusive(r, std::forward<ExecPolicy>(p),
-                               begin(in), end(in), begin(out), binop, value);
+  return impl::scan::exclusive(r,
+                               std::forward<ExecPolicy>(p),
+                               begin(in),
+                               end(in),
+                               begin(out),
+                               binop,
+                               value);
 }
 ///
 template <typename ExecPolicy,
@@ -298,12 +302,13 @@ template <typename ExecPolicy,
           typename T = RAJA::detail::ContainerVal<InContainer>,
           typename Function = operators::plus<T>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<InContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, InContainer>>,
-                      type_traits::is_range<OutContainer>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<InContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, InContainer>>,
+    type_traits::is_range<OutContainer>>
 exclusive_scan(ExecPolicy&& p,
                InContainer&& in,
                OutContainer&& out,
@@ -320,7 +325,7 @@ exclusive_scan(ExecPolicy&& p,
       value);
 }
 
-}  // end inline namespace policy_by_value_interface
+}  // namespace policy_by_value_interface
 
 
 /*!
@@ -329,11 +334,11 @@ exclusive_scan(ExecPolicy&& p,
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type >
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+template <typename ExecPolicy,
+          typename... Args,
+          typename Res = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 exclusive_scan(Args&&... args)
 {
   Res r = Res::get_default();
@@ -342,10 +347,9 @@ exclusive_scan(Args&&... args)
 }
 ///
 template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 exclusive_scan(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::exclusive_scan(
@@ -358,11 +362,11 @@ exclusive_scan(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 inclusive_scan(Args&&... args)
 {
   Res r = Res::get_default();
@@ -371,10 +375,9 @@ inclusive_scan(Args&&... args)
 }
 ///
 template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 inclusive_scan(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::inclusive_scan(
@@ -387,11 +390,11 @@ inclusive_scan(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 exclusive_scan_inplace(Args&&... args)
 {
   Res r = Res::get_default();
@@ -400,10 +403,9 @@ exclusive_scan_inplace(Args&&... args)
 }
 ///
 template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 exclusive_scan_inplace(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::exclusive_scan_inplace(
@@ -416,11 +418,11 @@ exclusive_scan_inplace(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 inclusive_scan_inplace(Args&&... args)
 {
   Res r = Res::get_default();
@@ -429,10 +431,9 @@ inclusive_scan_inplace(Args&&... args)
 }
 ///
 template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 inclusive_scan_inplace(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::inclusive_scan_inplace(
diff --git a/include/RAJA/pattern/sort.hpp b/include/RAJA/pattern/sort.hpp
index acf3fe5ba7..fd749c5a4a 100644
--- a/include/RAJA/pattern/sort.hpp
+++ b/include/RAJA/pattern/sort.hpp
@@ -18,15 +18,14 @@
 #ifndef RAJA_sort_HPP
 #define RAJA_sort_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iterator>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
+#include "RAJA/pattern/detail/algorithm.hpp"
 #include "RAJA/policy/PolicyBase.hpp"
-#include "RAJA/util/concepts.hpp"
 #include "RAJA/util/Operators.hpp"
-#include "RAJA/pattern/detail/algorithm.hpp"
+#include "RAJA/util/concepts.hpp"
 
 namespace RAJA
 {
@@ -46,23 +45,21 @@ inline namespace policy_by_value_interface
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
+template <
+    typename ExecPolicy,
+    typename Res,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
                       std::is_constructible<camp::resources::Resource, Res>,
                       type_traits::is_range<Container>>
-sort(ExecPolicy&& p,
-     Res r,
-     Container&& c,
-     Compare comp = Compare{})
+sort(ExecPolicy&& p, Res r, Container&& c, Compare comp = Compare{})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<Container>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -70,35 +67,35 @@ sort(ExecPolicy&& p,
                 "Container must model RandomAccessRange");
 
   auto begin_it = begin(c);
-  auto end_it   = end(c);
+  auto end_it = end(c);
   auto N = distance(begin_it, end_it);
 
   if (N > 1) {
-    return impl::sort::unstable(r, std::forward<ExecPolicy>(p),
-                                begin_it, end_it, comp);
+    return impl::sort::unstable(
+        r, std::forward<ExecPolicy>(p), begin_it, end_it, comp);
   } else {
     return resources::EventProxy<Res>(r);
   }
 }
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
-sort(ExecPolicy&& p,
-     Container&& c,
-     Compare comp = Compare{})
+template <
+    typename ExecPolicy,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
+    typename Res = typename resources::get_resource<ExecPolicy>::type>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
+sort(ExecPolicy&& p, Container&& c, Compare comp = Compare{})
 {
   Res r = Res::get_default();
-  return ::RAJA::policy_by_value_interface::sort(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      comp);
+  return ::RAJA::policy_by_value_interface::sort(std::forward<ExecPolicy>(p),
+                                                 r,
+                                                 std::forward<Container>(c),
+                                                 comp);
 }
 
 /*!
@@ -113,23 +110,21 @@ sort(ExecPolicy&& p,
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
+template <
+    typename ExecPolicy,
+    typename Res,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
                       std::is_constructible<camp::resources::Resource, Res>,
                       type_traits::is_range<Container>>
-stable_sort(ExecPolicy&& p,
-            Res r,
-            Container&& c,
-            Compare comp = Compare{})
+stable_sort(ExecPolicy&& p, Res r, Container&& c, Compare comp = Compare{})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<Container>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -137,35 +132,33 @@ stable_sort(ExecPolicy&& p,
                 "Container must model RandomAccessRange");
 
   auto begin_it = begin(c);
-  auto end_it   = end(c);
+  auto end_it = end(c);
   auto N = distance(begin_it, end_it);
 
   if (N > 1) {
-    return impl::sort::stable(r, std::forward<ExecPolicy>(p),
-                              begin_it, end_it, comp);
+    return impl::sort::stable(
+        r, std::forward<ExecPolicy>(p), begin_it, end_it, comp);
   } else {
     return resources::EventProxy<Res>(r);
   }
 }
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
-stable_sort(ExecPolicy&& p,
-            Container&& c,
-            Compare comp = Compare{})
+template <
+    typename ExecPolicy,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
+    typename Res = typename resources::get_resource<ExecPolicy>::type>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
+stable_sort(ExecPolicy&& p, Container&& c, Compare comp = Compare{})
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      comp);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), comp);
 }
 
 /*!
@@ -185,7 +178,8 @@ template <typename ExecPolicy,
           typename Res,
           typename KeyContainer,
           typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
+          typename Compare =
+              operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
@@ -199,8 +193,8 @@ sort_pairs(ExecPolicy&& p,
            Compare comp = Compare{})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<KeyContainer>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -210,12 +204,12 @@ sort_pairs(ExecPolicy&& p,
                 "ValContainer must model RandomAccessRange");
 
   auto begin_key = begin(keys);
-  auto end_key   = end(keys);
+  auto end_key = end(keys);
   auto N = distance(begin_key, end_key);
 
   if (N > 1) {
-    return impl::sort::unstable_pairs(r, std::forward<ExecPolicy>(p),
-                                      begin_key, end_key, begin(vals), comp);
+    return impl::sort::unstable_pairs(
+        r, std::forward<ExecPolicy>(p), begin_key, end_key, begin(vals), comp);
   } else {
     return resources::EventProxy<Res>(r);
   }
@@ -224,13 +218,16 @@ sort_pairs(ExecPolicy&& p,
 template <typename ExecPolicy,
           typename KeyContainer,
           typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
+          typename Compare =
+              operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<KeyContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, KeyContainer>>,
-                      type_traits::is_range<ValContainer>>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<KeyContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, KeyContainer>>,
+    type_traits::is_range<ValContainer>>
 sort_pairs(ExecPolicy&& p,
            KeyContainer&& keys,
            ValContainer&& vals,
@@ -262,7 +259,8 @@ template <typename ExecPolicy,
           typename Res,
           typename KeyContainer,
           typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
+          typename Compare =
+              operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
@@ -276,8 +274,8 @@ stable_sort_pairs(ExecPolicy&& p,
                   Compare comp = Compare{})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<KeyContainer>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -287,12 +285,12 @@ stable_sort_pairs(ExecPolicy&& p,
                 "ValContainer must model RandomAccessRange");
 
   auto begin_key = begin(keys);
-  auto end_key   = end(keys);
+  auto end_key = end(keys);
   auto N = distance(begin_key, end_key);
 
   if (N > 1) {
-    return impl::sort::stable_pairs(r, std::forward<ExecPolicy>(p),
-                                    begin_key, end_key, begin(vals), comp);
+    return impl::sort::stable_pairs(
+        r, std::forward<ExecPolicy>(p), begin_key, end_key, begin(vals), comp);
   } else {
     return resources::EventProxy<Res>(r);
   }
@@ -301,13 +299,16 @@ stable_sort_pairs(ExecPolicy&& p,
 template <typename ExecPolicy,
           typename KeyContainer,
           typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
+          typename Compare =
+              operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<KeyContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, KeyContainer>>,
-                      type_traits::is_range<ValContainer>>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<KeyContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, KeyContainer>>,
+    type_traits::is_range<ValContainer>>
 stable_sort_pairs(ExecPolicy&& p,
                   KeyContainer&& keys,
                   ValContainer&& vals,
@@ -322,7 +323,7 @@ stable_sort_pairs(ExecPolicy&& p,
       comp);
 }
 
-}  // end inline namespace policy_by_value_interface
+}  // namespace policy_by_value_interface
 
 // =============================================================================
 
@@ -332,11 +333,12 @@ stable_sort_pairs(ExecPolicy&& p,
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-sort(Args &&... args)
+sort(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort<ExecPolicy>(
@@ -347,10 +349,11 @@ template <typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-sort(Res r, Args &&... args)
+sort(Res r, Args&&... args)
 {
-  return ::RAJA::policy_by_value_interface::sort(
-      ExecPolicy(), r, std::forward<Args>(args)...);
+  return ::RAJA::policy_by_value_interface::sort(ExecPolicy(),
+                                                 r,
+                                                 std::forward<Args>(args)...);
 }
 
 /*!
@@ -359,11 +362,12 @@ sort(Res r, Args &&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-stable_sort(Args &&... args)
+stable_sort(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort<ExecPolicy>(
@@ -374,7 +378,7 @@ template <typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-stable_sort(Res r, Args &&... args)
+stable_sort(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::stable_sort(
       ExecPolicy(), r, std::forward<Args>(args)...);
@@ -386,11 +390,12 @@ stable_sort(Res r, Args &&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-sort_pairs(Args &&... args)
+sort_pairs(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort_pairs<ExecPolicy>(
@@ -401,7 +406,7 @@ template <typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-sort_pairs(Res r, Args &&... args)
+sort_pairs(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::sort_pairs(
       ExecPolicy(), r, std::forward<Args>(args)...);
@@ -413,11 +418,12 @@ sort_pairs(Res r, Args &&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-stable_sort_pairs(Args &&... args)
+stable_sort_pairs(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort_pairs<ExecPolicy>(
@@ -428,7 +434,7 @@ template <typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-stable_sort_pairs(Res r, Args &&... args)
+stable_sort_pairs(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::stable_sort_pairs(
       ExecPolicy(), r, std::forward<Args>(args)...);
diff --git a/include/RAJA/pattern/tensor.hpp b/include/RAJA/pattern/tensor.hpp
index 547f5bbad3..be8abbc32b 100644
--- a/include/RAJA/pattern/tensor.hpp
+++ b/include/RAJA/pattern/tensor.hpp
@@ -18,20 +18,16 @@
 #ifndef RAJA_pattern_tensor_HPP
 #define RAJA_pattern_tensor_HPP
 
-#include "RAJA/pattern/tensor/stats.hpp"
-#include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/pattern/tensor/MatrixRegister.hpp"
+#include "RAJA/pattern/tensor/ScalarRegister.hpp"
+#include "RAJA/pattern/tensor/TensorBlock.hpp"
 #include "RAJA/pattern/tensor/TensorIndex.hpp"
 #include "RAJA/pattern/tensor/TensorRegister.hpp"
-
-#include "RAJA/pattern/tensor/ScalarRegister.hpp"
 #include "RAJA/pattern/tensor/VectorRegister.hpp"
-#include "RAJA/pattern/tensor/MatrixRegister.hpp"
-
 #include "RAJA/pattern/tensor/internal/ExpressionTemplate.hpp"
 #include "RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp"
+#include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
 #include "RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp"
-
-
-#include "RAJA/pattern/tensor/TensorBlock.hpp"
+#include "RAJA/pattern/tensor/stats.hpp"
 
 #endif
diff --git a/include/RAJA/pattern/tensor/MatrixRegister.hpp b/include/RAJA/pattern/tensor/MatrixRegister.hpp
index 9fa39f34ee..3154b22807 100644
--- a/include/RAJA/pattern/tensor/MatrixRegister.hpp
+++ b/include/RAJA/pattern/tensor/MatrixRegister.hpp
@@ -18,35 +18,37 @@
 #ifndef RAJA_pattern_tensor_MatrixRegister_HPP
 #define RAJA_pattern_tensor_MatrixRegister_HPP
 
-#include "camp/camp.hpp"
 #include "RAJA/config.hpp"
-#include "RAJA/policy/tensor/arch.hpp"
 #include "RAJA/pattern/tensor/TensorRegister.hpp"
+#include "RAJA/policy/tensor/arch.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
 {
 namespace expt
 {
-  template<typename T, typename LAYOUT, typename REGISTER_POLICY = default_register>
-  using SquareMatrixRegister =
-      TensorRegister<REGISTER_POLICY,
-                     T,
-                     LAYOUT,
-                     camp::idx_seq<RAJA::internal::expt::RegisterTraits<REGISTER_POLICY,T>::s_num_elem,
-                                   RAJA::internal::expt::RegisterTraits<REGISTER_POLICY,T>::s_num_elem>>;
-
-  template<typename T, typename LAYOUT, camp::idx_t ROWS, camp::idx_t COLS,
-           typename REGISTER_POLICY = default_register>
-  using RectMatrixRegister =
-      TensorRegister<REGISTER_POLICY,
-                     T,
-                     LAYOUT,
-                     camp::idx_seq<ROWS,COLS>>;
-
-} // namespace expt
+template <typename T,
+          typename LAYOUT,
+          typename REGISTER_POLICY = default_register>
+using SquareMatrixRegister = TensorRegister<
+    REGISTER_POLICY,
+    T,
+    LAYOUT,
+    camp::idx_seq<
+        RAJA::internal::expt::RegisterTraits<REGISTER_POLICY, T>::s_num_elem,
+        RAJA::internal::expt::RegisterTraits<REGISTER_POLICY, T>::s_num_elem>>;
+
+template <typename T,
+          typename LAYOUT,
+          camp::idx_t ROWS,
+          camp::idx_t COLS,
+          typename REGISTER_POLICY = default_register>
+using RectMatrixRegister =
+    TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<ROWS, COLS>>;
+
+}  // namespace expt
 }  // namespace RAJA
 
 
-
 #endif
diff --git a/include/RAJA/pattern/tensor/ScalarRegister.hpp b/include/RAJA/pattern/tensor/ScalarRegister.hpp
index f6675b4ba9..d532d58ade 100644
--- a/include/RAJA/pattern/tensor/ScalarRegister.hpp
+++ b/include/RAJA/pattern/tensor/ScalarRegister.hpp
@@ -28,16 +28,14 @@ namespace RAJA
 namespace expt
 {
 
-  // Convenience to describe ScalarTensors
-  template<typename T>
-  using ScalarRegister = TensorRegister<scalar_register,
-                                        T,
-                                        ScalarLayout,
-                                        camp::idx_seq<>>;
+// Convenience to describe ScalarTensors
+template <typename T>
+using ScalarRegister =
+    TensorRegister<scalar_register, T, ScalarLayout, camp::idx_seq<>>;
 
 
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/TensorBlock.hpp b/include/RAJA/pattern/tensor/TensorBlock.hpp
index 0e9869a772..7e0078aea4 100644
--- a/include/RAJA/pattern/tensor/TensorBlock.hpp
+++ b/include/RAJA/pattern/tensor/TensorBlock.hpp
@@ -20,12 +20,10 @@
 #define RAJA_pattern_tensor_TensorBlock_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
-#include "camp/camp.hpp"
 #include "RAJA/pattern/tensor/TensorRegister.hpp"
 #include "RAJA/util/StaticLayout.hpp"
+#include "RAJA/util/macros.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -360,7 +358,6 @@ namespace ET{
 }  // namespace RAJA
 
 
-
 #endif
 
 #endif
diff --git a/include/RAJA/pattern/tensor/TensorIndex.hpp b/include/RAJA/pattern/tensor/TensorIndex.hpp
index 8f152d92ce..c5843c5c5f 100644
--- a/include/RAJA/pattern/tensor/TensorIndex.hpp
+++ b/include/RAJA/pattern/tensor/TensorIndex.hpp
@@ -19,8 +19,8 @@
 #define RAJA_pattern_tensor_TensorIndex_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/index/IndexValue.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -29,196 +29,196 @@ namespace expt
 {
 
 
-  template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, strip_index_type_t<IDX> INDEX_VALUE, strip_index_type_t<IDX> LENGTH_VALUE>
-  struct StaticTensorIndexInner;
-
-  template<typename INNER_TYPE>
-  struct StaticTensorIndex;
-
-
-  template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
-  class TensorIndex {
-    public:
-      using self_type = TensorIndex<IDX, TENSOR_TYPE, DIM>;
-      using value_type = strip_index_type_t<IDX>;
-      using index_type = IDX;
-      using tensor_type = TENSOR_TYPE;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      self_type all(){
-        return self_type(index_type(-1), value_type(-1));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,value_type(-1),value_type(-1)>> static_all(){
-        return StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,value_type(-1),value_type(-1)>>();
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      self_type range(index_type begin, index_type end){
-        return self_type(begin, value_type(stripIndexType(end-begin)));
-      }
-
-      template<value_type TBEGIN, value_type TEND>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,TBEGIN,TEND-TBEGIN>> static_range(){
-        return StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,TBEGIN,TEND-TBEGIN>>();
-      }
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex() : m_index(index_type(0)), m_length(0) {}
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(RAJA::TypedRangeSegment<IDX> const &seg) :
-      m_index(*seg.begin()), m_length(seg.size())
-      {}
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(index_type value, value_type length) : m_index(value), m_length(length) {}
-
-      template<typename T, camp::idx_t D>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(TensorIndex<IDX, T, D> const &c) : m_index(*c), m_length(c.size()) {}
-
-
-      template<value_type IDX_VAL, value_type LEN_VAL>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(StaticTensorIndex<StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, IDX_VAL, LEN_VAL>> const RAJA_UNUSED_ARG(&c))
-          : m_index(IDX_VAL)
-          , m_length(LEN_VAL)
-      {}
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      index_type const &operator*() const {
-        return m_index;
-      }
-
-      // used in strip_by_value as a static cast
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      explicit operator index_type() const {
-        // return does not matter, but suppresses no-return warnings
-        return m_index;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      index_type begin() const {
-        return m_index;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      value_type size() const {
-        return m_length;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      value_type dim() const {
-        return DIM;
-      }
-
-    private:
-      index_type m_index;
-      value_type m_length;
-  };
-
-
-  template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, strip_index_type_t<IDX> INDEX_VALUE, strip_index_type_t<IDX> LENGTH_VALUE>
-  struct StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,INDEX_VALUE,LENGTH_VALUE>> {
-
-      using base_type  = TensorIndex<IDX,TENSOR_TYPE,DIM>;
-      using value_type = strip_index_type_t<IDX>;
-      using index_type = IDX;
-      using tensor_type = TENSOR_TYPE;
-
-      static const index_type s_index  = INDEX_VALUE;
-      static const index_type s_length = LENGTH_VALUE;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr operator base_type() {
-        return base_type(s_index,s_length);
-      }
-    
-  };
-
-
-
-  /*!
-   * Index that specifies the starting element index of a Vector
-   */
-  template<typename IDX, typename VECTOR_TYPE>
-  using VectorIndex =  TensorIndex<IDX, VECTOR_TYPE, 0>;
-
-  /*!
-   * Index that specifies the starting Row index of a matrix
-   */
-  template<typename IDX, typename MATRIX_TYPE>
-  using RowIndex =  TensorIndex<IDX, MATRIX_TYPE, 0>;
-
-  /*!
-   * Index that specifies the starting Column index of a matrix
-   */
-  template<typename IDX, typename MATRIX_TYPE>
-  using ColIndex =  TensorIndex<IDX, MATRIX_TYPE, 1>;
-
-
-  /*!
-   * Converts a Row index to a Column index
-   */
-  template<typename IDX, typename MATRIX_TYPE>
+template <typename IDX,
+          typename TENSOR_TYPE,
+          camp::idx_t DIM,
+          strip_index_type_t<IDX> INDEX_VALUE,
+          strip_index_type_t<IDX> LENGTH_VALUE>
+struct StaticTensorIndexInner;
+
+template <typename INNER_TYPE>
+struct StaticTensorIndex;
+
+
+template <typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
+class TensorIndex
+{
+public:
+  using self_type = TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using value_type = strip_index_type_t<IDX>;
+  using index_type = IDX;
+  using tensor_type = TENSOR_TYPE;
+
+  RAJA_INLINE
   RAJA_HOST_DEVICE
+  static constexpr self_type all()
+  {
+    return self_type(index_type(-1), value_type(-1));
+  }
+
   RAJA_INLINE
-  constexpr
-  ColIndex<IDX, MATRIX_TYPE> toColIndex(RowIndex<IDX, MATRIX_TYPE> const &r){
-    return ColIndex<IDX, MATRIX_TYPE>(*r, r.size());
+  RAJA_HOST_DEVICE
+  static constexpr StaticTensorIndex<StaticTensorIndexInner<IDX,
+                                                            TENSOR_TYPE,
+                                                            DIM,
+                                                            value_type(-1),
+                                                            value_type(-1)>>
+  static_all()
+  {
+    return StaticTensorIndex<StaticTensorIndexInner<IDX,
+                                                    TENSOR_TYPE,
+                                                    DIM,
+                                                    value_type(-1),
+                                                    value_type(-1)>>();
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr self_type range(index_type begin, index_type end)
+  {
+    return self_type(begin, value_type(stripIndexType(end - begin)));
   }
 
-  /*!
-   * Converts a Column index to a Row index
-   */
-  template<typename IDX, typename MATRIX_TYPE>
+  template <value_type TBEGIN, value_type TEND>
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr StaticTensorIndex<
+      StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, TBEGIN, TEND - TBEGIN>>
+  static_range()
+  {
+    return StaticTensorIndex<
+        StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, TBEGIN, TEND - TBEGIN>>();
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr TensorIndex() : m_index(index_type(0)), m_length(0) {}
+
+
+  RAJA_INLINE
   RAJA_HOST_DEVICE
+  constexpr TensorIndex(RAJA::TypedRangeSegment<IDX> const &seg)
+      : m_index(*seg.begin()), m_length(seg.size())
+  {
+  }
+
   RAJA_INLINE
-  constexpr
-  RowIndex<IDX, MATRIX_TYPE> toRowIndex(ColIndex<IDX, MATRIX_TYPE> const &c){
-    return RowIndex<IDX, MATRIX_TYPE>(*c, c.size());
+  RAJA_HOST_DEVICE
+  constexpr TensorIndex(index_type value, value_type length)
+      : m_index(value), m_length(length)
+  {
   }
 
-} // namespace expt
+  template <typename T, camp::idx_t D>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorIndex(
+      TensorIndex<IDX, T, D> const &c)
+      : m_index(*c), m_length(c.size())
+  {
+  }
+
+
+  template <value_type IDX_VAL, value_type LEN_VAL>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorIndex(
+      StaticTensorIndex<
+          StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, IDX_VAL, LEN_VAL>> const
+          RAJA_UNUSED_ARG(&c))
+      : m_index(IDX_VAL), m_length(LEN_VAL)
+  {
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type const &operator*() const { return m_index; }
+
+  // used in strip_by_value as a static cast
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr explicit operator index_type() const
+  {
+    // return does not matter, but suppresses no-return warnings
+    return m_index;
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type begin() const { return m_index; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr value_type size() const { return m_length; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr value_type dim() const { return DIM; }
+
+private:
+  index_type m_index;
+  value_type m_length;
+};
+
+
+template <typename IDX,
+          typename TENSOR_TYPE,
+          camp::idx_t DIM,
+          strip_index_type_t<IDX> INDEX_VALUE,
+          strip_index_type_t<IDX> LENGTH_VALUE>
+struct StaticTensorIndex<
+    StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, INDEX_VALUE, LENGTH_VALUE>> {
+
+  using base_type = TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using value_type = strip_index_type_t<IDX>;
+  using index_type = IDX;
+  using tensor_type = TENSOR_TYPE;
+
+  static const index_type s_index = INDEX_VALUE;
+  static const index_type s_length = LENGTH_VALUE;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr operator base_type() { return base_type(s_index, s_length); }
+};
+
+
+/*!
+ * Index that specifies the starting element index of a Vector
+ */
+template <typename IDX, typename VECTOR_TYPE>
+using VectorIndex = TensorIndex<IDX, VECTOR_TYPE, 0>;
+
+/*!
+ * Index that specifies the starting Row index of a matrix
+ */
+template <typename IDX, typename MATRIX_TYPE>
+using RowIndex = TensorIndex<IDX, MATRIX_TYPE, 0>;
+
+/*!
+ * Index that specifies the starting Column index of a matrix
+ */
+template <typename IDX, typename MATRIX_TYPE>
+using ColIndex = TensorIndex<IDX, MATRIX_TYPE, 1>;
+
+
+/*!
+ * Converts a Row index to a Column index
+ */
+template <typename IDX, typename MATRIX_TYPE>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr ColIndex<IDX, MATRIX_TYPE> toColIndex(
+    RowIndex<IDX, MATRIX_TYPE> const &r)
+{
+  return ColIndex<IDX, MATRIX_TYPE>(*r, r.size());
+}
+
+/*!
+ * Converts a Column index to a Row index
+ */
+template <typename IDX, typename MATRIX_TYPE>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr RowIndex<IDX, MATRIX_TYPE> toRowIndex(
+    ColIndex<IDX, MATRIX_TYPE> const &c)
+{
+  return RowIndex<IDX, MATRIX_TYPE>(*c, c.size());
+}
+
+}  // namespace expt
 }  // namespace RAJA
 
 #include "RAJA/pattern/tensor/internal/TensorIndexTraits.hpp"
diff --git a/include/RAJA/pattern/tensor/TensorLayout.hpp b/include/RAJA/pattern/tensor/TensorLayout.hpp
index 376d6b905a..55046cc8f7 100644
--- a/include/RAJA/pattern/tensor/TensorLayout.hpp
+++ b/include/RAJA/pattern/tensor/TensorLayout.hpp
@@ -28,67 +28,54 @@ namespace expt
 {
 
 
-  template<camp::idx_t ... DIM_SEQ>
-  struct TensorLayout : public camp::idx_seq<DIM_SEQ...>
-  {
+template <camp::idx_t... DIM_SEQ>
+struct TensorLayout : public camp::idx_seq<DIM_SEQ...> {
 
-      using seq_t = camp::idx_seq<DIM_SEQ...>;
+  using seq_t = camp::idx_seq<DIM_SEQ...>;
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_column_major(){
-        return false;
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool is_column_major() { return false; }
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_row_major(){
-        return false;
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool is_row_major() { return false; }
+};
 
-  };
 
+// specialization for Matrix layouts, where column vs row major matters
+template <camp::idx_t S2, camp::idx_t S1>
+struct TensorLayout<S2, S1> : public camp::idx_seq<S2, S1> {
+  using seq_t = camp::idx_seq<S2, S1>;
 
-  // specialization for Matrix layouts, where column vs row major matters
-  template<camp::idx_t S2, camp::idx_t S1>
-  struct TensorLayout<S2, S1> : public camp::idx_seq<S2, S1>
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool is_column_major()
   {
-      using seq_t = camp::idx_seq<S2, S1>;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_column_major(){
-        return S1 == 0; // Rows are stride-1
-      }
+    return S1 == 0;  // Rows are stride-1
+  }
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_row_major(){
-        return S1 == 1; // Columns are stride-1
-      }
-  };
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool is_row_major()
+  {
+    return S1 == 1;  // Columns are stride-1
+  }
+};
 
 
-  // 0d tensor (scalar) layout
-  using ScalarLayout = TensorLayout<>;
+// 0d tensor (scalar) layout
+using ScalarLayout = TensorLayout<>;
 
-  // 1d tensor (vector) layout
-  using VectorLayout = TensorLayout<0>;
+// 1d tensor (vector) layout
+using VectorLayout = TensorLayout<0>;
 
-  // 2d tensor (matrix) layouts
-  using RowMajorLayout = TensorLayout<0, 1>;
-  using ColMajorLayout = TensorLayout<1, 0>;
+// 2d tensor (matrix) layouts
+using RowMajorLayout = TensorLayout<0, 1>;
+using ColMajorLayout = TensorLayout<1, 0>;
 
 
-} // namespace expt
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/TensorRegister.hpp b/include/RAJA/pattern/tensor/TensorRegister.hpp
index d410f46fb7..fc7f7c0195 100644
--- a/include/RAJA/pattern/tensor/TensorRegister.hpp
+++ b/include/RAJA/pattern/tensor/TensorRegister.hpp
@@ -19,89 +19,98 @@
 #define RAJA_pattern_tensor_TensorRegister_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
-#include "camp/camp.hpp"
 #include "RAJA/pattern/tensor/TensorLayout.hpp"
 #include "RAJA/pattern/tensor/internal/TensorRef.hpp"
+#include "RAJA/util/macros.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
-namespace internal {
-namespace expt {
-    class TensorRegisterConcreteBase;
-}
+namespace internal
+{
+namespace expt
+{
+class TensorRegisterConcreteBase;
 }
+}  // namespace internal
 
 namespace expt
 {
 
 
-  template<typename REGISTER_POLICY,
-           typename T,
-           typename LAYOUT,
-           typename SIZES>
-  class TensorRegister;
+template <typename REGISTER_POLICY, typename T, typename LAYOUT, typename SIZES>
+class TensorRegister;
 
 
-  /*
-   * Overload for:    arithmetic + TensorRegister
+/*
+ * Overload for:    arithmetic + TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator+(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).add(rhs);
-  }
-
-  /*
-   * Overload for:    arithmetic - TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator+(LEFT const &lhs, RIGHT const &rhs)
+{
+  return RIGHT(lhs).add(rhs);
+}
+
+/*
+ * Overload for:    arithmetic - TensorRegister
+
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator-(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).subtract(rhs);
-  }
-
-  /*
-   * Overload for:    arithmetic * TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator-(LEFT const &lhs, RIGHT const &rhs)
+{
+  return RIGHT(lhs).subtract(rhs);
+}
+
+/*
+ * Overload for:    arithmetic * TensorRegister
+
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator*(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return rhs.scale(lhs);
-  }
-
-  /*
-   * Overload for:    arithmetic / TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator*(LEFT const &lhs, RIGHT const &rhs)
+{
+  return rhs.scale(lhs);
+}
+
+/*
+ * Overload for:    arithmetic / TensorRegister
+
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator/(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).divide(rhs);
-  }
-
-} // namespace expt
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator/(LEFT const &lhs, RIGHT const &rhs)
+{
+  return RIGHT(lhs).divide(rhs);
+}
+
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/VectorRegister.hpp b/include/RAJA/pattern/tensor/VectorRegister.hpp
index afab05658f..6be58576ee 100644
--- a/include/RAJA/pattern/tensor/VectorRegister.hpp
+++ b/include/RAJA/pattern/tensor/VectorRegister.hpp
@@ -24,16 +24,15 @@ namespace RAJA
 {
 namespace expt
 {
-  // Convenience to describe VectorTensors
-  template<typename T, typename REGISTER_POLICY = default_register, camp::idx_t NUM_ELEM = Register<T,REGISTER_POLICY>::s_num_elem>
-  using VectorRegister = TensorRegister<REGISTER_POLICY,
-                                        T,
-                                        VectorLayout,
-                                        camp::idx_seq<NUM_ELEM> >;
-} // namespace expt
-
-} // namespace RAJA
-
+// Convenience to describe VectorTensors
+template <typename T,
+          typename REGISTER_POLICY = default_register,
+          camp::idx_t NUM_ELEM = Register<T, REGISTER_POLICY>::s_num_elem>
+using VectorRegister =
+    TensorRegister<REGISTER_POLICY, T, VectorLayout, camp::idx_seq<NUM_ELEM> >;
+}  // namespace expt
+
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp b/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp
index 953f4fd4a0..fed360d44c 100644
--- a/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp
@@ -19,11 +19,9 @@
 #define RAJA_pattern_tensor_ET_TensorAdd_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
-#include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp"
+#include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -34,110 +32,122 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+
+template <typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
+class TensorBinaryOperator
+    : public TensorExpressionBase<
+          TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>>
+{
+public:
+  using self_type = TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>;
+  using operator_type = OPERATOR;
+  using left_operand_type = LEFT_OPERAND;
+  using right_operand_type = RIGHT_OPERAND;
+
+  using element_type = typename LEFT_OPERAND::element_type;
+  using index_type = typename LEFT_OPERAND::index_type;
+
+  using operator_traits = OperatorTraits<LEFT_OPERAND, RIGHT_OPERAND>;
+  using result_type = typename operator_traits::result_type;
+
+  static constexpr camp::idx_t s_num_dims = operator_traits::s_num_dims;
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorBinaryOperator(left_operand_type const &left,
+                       right_operand_type const &right)
+      : m_left_operand{left}, m_right_operand{right}
+  {
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr auto getDimSize(camp::idx_t dim) const
+      -> decltype(operator_traits::getDimSize(dim,
+                                              m_left_operand,
+                                              m_right_operand))
+  {
+    return operator_traits::getDimSize(dim, m_left_operand, m_right_operand);
+  }
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const &tile) const
+      -> decltype(operator_type::eval(m_left_operand.eval(tile),
+                                      m_right_operand.eval(tile)))
+  {
+    return operator_type::eval(m_left_operand.eval(tile),
+                               m_right_operand.eval(tile));
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
   {
+    operator_type::print_ast();
+    printf("[");
+    operator_type::print_ast();
+    printf("](");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(")");
+  }
+};
+
+
+/*
+ * Overload for:    arithmetic + tensorexpression
+
+ */
+template <typename LEFT_OPERAND,
+          typename RIGHT_OPERAND,
+          typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value,
+                                  bool>::type = true,
+          typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase,
+                                                  RIGHT_OPERAND>::value,
+                                  bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator+(LEFT_OPERAND const &left,
+                                            RIGHT_OPERAND const &right)
+    -> TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+                 RIGHT_OPERAND>
+{
+  return TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+                   RIGHT_OPERAND>(
+      NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
+}
 
 
-    template<typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
-    class TensorBinaryOperator :
-        public TensorExpressionBase<TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>>
-    {
-      public:
-        using self_type = TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>;
-        using operator_type = OPERATOR;
-        using left_operand_type = LEFT_OPERAND;
-        using right_operand_type = RIGHT_OPERAND;
-
-        using element_type = typename LEFT_OPERAND::element_type;
-        using index_type = typename LEFT_OPERAND::index_type;
-
-        using operator_traits = OperatorTraits<LEFT_OPERAND, RIGHT_OPERAND>;
-        using result_type = typename operator_traits::result_type;
-
-        static constexpr camp::idx_t s_num_dims =
-            operator_traits::s_num_dims;
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-
-      public:
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorBinaryOperator(left_operand_type const &left, right_operand_type const &right) :
-        m_left_operand{left}, m_right_operand{right}
-        {}
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        auto getDimSize(camp::idx_t dim) const ->
-        decltype(operator_traits::getDimSize(dim, m_left_operand, m_right_operand))
-        {
-          return operator_traits::getDimSize(dim, m_left_operand, m_right_operand);
-        }
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(operator_type::eval(m_left_operand.eval(tile), m_right_operand.eval(tile)))
-        {
-          return operator_type::eval(m_left_operand.eval(tile), m_right_operand.eval(tile));
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          operator_type::print_ast();
-          printf("[");
-          operator_type::print_ast();
-          printf("](");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(")");
-        }
-
-
-    };
-
-
-
-
-    /*
-     * Overload for:    arithmetic + tensorexpression
-
-     */
-    template<typename LEFT_OPERAND, typename RIGHT_OPERAND,
-      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RIGHT_OPERAND>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator+(LEFT_OPERAND const &left, RIGHT_OPERAND const &right) ->
-    TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>
-    {
-      return TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
-    }
-
-
-    /*
-     * Overload for:    arithmetic - tensorexpression
-
-     */
-    template<typename LEFT_OPERAND, typename RIGHT_OPERAND,
-      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RIGHT_OPERAND>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator-(LEFT_OPERAND const &left, RIGHT_OPERAND const &right) ->
-    TensorSubtract<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>
-    {
-      return TensorSubtract<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
-    }
+/*
+ * Overload for:    arithmetic - tensorexpression
+
+ */
+template <typename LEFT_OPERAND,
+          typename RIGHT_OPERAND,
+          typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value,
+                                  bool>::type = true,
+          typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase,
+                                                  RIGHT_OPERAND>::value,
+                                  bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator-(LEFT_OPERAND const &left,
+                                            RIGHT_OPERAND const &right)
+    -> TensorSubtract<
+        typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+        RIGHT_OPERAND>
+{
+  return TensorSubtract<
+      typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+      RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left),
+                     right);
+}
 
 
 //    /*
@@ -145,21 +155,27 @@ namespace expt
 //
 //     */
 //    template<typename LEFT_OPERAND, typename RIGHT_OPERAND,
-//      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value, bool>::type = true,
-//      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RIGHT_OPERAND>::value, bool>::type = true>
+//      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value,
+//      bool>::type = true, typename
+//      std::enable_if<std::is_base_of<TensorExpressionConcreteBase,
+//      RIGHT_OPERAND>::value, bool>::type = true>
 //    RAJA_INLINE
 //    RAJA_HOST_DEVICE
 //    auto operator/(LEFT_OPERAND const &left, RIGHT_OPERAND const &right) ->
-//    TensorDivide<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>
+//    TensorDivide<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+//    RIGHT_OPERAND>
 //    {
-//      return TensorDivide<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
+//      return TensorDivide<typename
+//      NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+//      RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left),
+//      right);
 //    }
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp b/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp
index a1450bf19f..6fde1a2d6c 100644
--- a/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp
@@ -27,159 +27,130 @@ namespace expt
 {
 
 
-  namespace ET
-  {
-
-    struct TensorOperatorAdd
-    {
-
-      template<typename LEFT, typename RIGHT>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      auto eval(LEFT const &left, RIGHT const &right) ->
-        decltype(left + right)
-      {
-        return left + right;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast(){
-        printf("Add");
-      }
-    };
-
-    struct TensorOperatorSubtract
-    {
-
-      template<typename LEFT, typename RIGHT>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      auto eval(LEFT const &left, RIGHT const &right) ->
-        decltype(left - right)
-      {
-        return left - right;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast(){
-        printf("Subtract");
-      }
-    };
-
-
-
-
-
-    template<typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
-    class TensorBinaryOperator;
-
-    template<typename LHS, typename RHS>
-    using TensorAdd = TensorBinaryOperator<TensorOperatorAdd, LHS, RHS>;
-
-    template<typename LHS, typename RHS>
-    using TensorSubtract = TensorBinaryOperator<TensorOperatorSubtract, LHS, RHS>;
-
-
+namespace ET
+{
 
+struct TensorOperatorAdd {
 
-    /*!
-     * Provides default operations for add, subtract and divide
-     *
-     * For the most part, this is just element wise operations between
-     * compatible tensors.
-     *
-     * There are specializations that handle when one operand is a scalar
-     */
-    template<typename LHS_TYPE, typename RHS_TYPE, class ENABLE = void>
-    struct OperatorTraits {
+  template <typename LEFT, typename RIGHT>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto eval(LEFT const &left,
+                                                RIGHT const &right)
+      -> decltype(left + right)
+  {
+    return left + right;
+  }
 
-        using result_type = typename LHS_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Add"); }
+};
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Elemental");
-        }
+struct TensorOperatorSubtract {
 
+  template <typename LEFT, typename RIGHT>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto eval(LEFT const &left,
+                                                RIGHT const &right)
+      -> decltype(left - right)
+  {
+    return left - right;
+  }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LHS_TYPE const &lhs, RHS_TYPE const &rhs) {
-          return dim == 0 ? lhs.getDimSize(0) : rhs.getDimSize(1);
-        }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Subtract"); }
+};
 
-    };
 
-    /*!
-     * Specialization when the left operand is a scalar
-     */
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    struct OperatorTraits<LHS_TYPE, RHS_TYPE,
-    typename std::enable_if<LHS_TYPE::s_num_dims == 0>::type>
-    {
+template <typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
+class TensorBinaryOperator;
 
-        using result_type = typename RHS_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = RHS_TYPE::s_num_dims;
+template <typename LHS, typename RHS>
+using TensorAdd = TensorBinaryOperator<TensorOperatorAdd, LHS, RHS>;
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scalar");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LHS_TYPE const &, RHS_TYPE const &rhs) {
-          return rhs.getDimSize(dim);
-        }
+template <typename LHS, typename RHS>
+using TensorSubtract = TensorBinaryOperator<TensorOperatorSubtract, LHS, RHS>;
 
-    };
 
-    /*!
-     * Specialization when the right operand is a scalar
-     */
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    struct OperatorTraits<LHS_TYPE, RHS_TYPE,
-    typename std::enable_if<RHS_TYPE::s_num_dims == 0>::type>
-    {
+/*!
+ * Provides default operations for add, subtract and divide
+ *
+ * For the most part, this is just element wise operations between
+ * compatible tensors.
+ *
+ * There are specializations that handle when one operand is a scalar
+ */
+template <typename LHS_TYPE, typename RHS_TYPE, class ENABLE = void>
+struct OperatorTraits {
 
-        using result_type = typename LHS_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
+  using result_type = typename LHS_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scalar");
-        }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Elemental"); }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LHS_TYPE const &lhs, RHS_TYPE const &) {
-          return lhs.getDimSize(dim);
-        }
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim, LHS_TYPE const &lhs, RHS_TYPE const &rhs)
+  {
+    return dim == 0 ? lhs.getDimSize(0) : rhs.getDimSize(1);
+  }
+};
 
+/*!
+ * Specialization when the left operand is a scalar
+ */
+template <typename LHS_TYPE, typename RHS_TYPE>
+struct OperatorTraits<
+    LHS_TYPE,
+    RHS_TYPE,
+    typename std::enable_if<LHS_TYPE::s_num_dims == 0>::type> {
+
+  using result_type = typename RHS_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RHS_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scalar"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim, LHS_TYPE const &, RHS_TYPE const &rhs)
+  {
+    return rhs.getDimSize(dim);
+  }
+};
 
-    };
+/*!
+ * Specialization when the right operand is a scalar
+ */
+template <typename LHS_TYPE, typename RHS_TYPE>
+struct OperatorTraits<
+    LHS_TYPE,
+    RHS_TYPE,
+    typename std::enable_if<RHS_TYPE::s_num_dims == 0>::type> {
+
+  using result_type = typename LHS_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scalar"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim, LHS_TYPE const &lhs, RHS_TYPE const &)
+  {
+    return lhs.getDimSize(dim);
+  }
+};
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp b/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp
index 210414eaec..e75af79591 100644
--- a/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp
@@ -19,12 +19,9 @@
 #define RAJA_pattern_tensor_ET_BlockLiteral_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/TensorRef.hpp"
-
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -35,93 +32,91 @@ namespace expt
 {
 
 
-  namespace ET
-  {
-
-
-    /*!
-     * Temporary n-dimensional memory.
-     *
-     * STORAGE_TYPE defines the memory storage
-     * TENSOR_TYPE defines what kind of tensor is returned by eval()
-     */
-    template<typename STORAGE_TYPE, typename TENSOR_TYPE>
-    class BlockLiteral :  public TensorExpressionBase<BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>> {
-      public:
-        using self_type = BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>;
-        using storage_type = STORAGE_TYPE;
-        using tensor_type = TENSOR_TYPE;
-        using result_type = TENSOR_TYPE;
-        using ref_type = typename STORAGE_TYPE::ref_type;
-        using tile_type = typename ref_type::tile_type;
-        using index_type = camp::idx_t;
-
-        static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
-
-
-      private:
-        storage_type m_storage;
-        tile_type m_tile_origin;
-
-      public:
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return storage_type::s_dim_elem(dim);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        BlockLiteral(tile_type tile_origin) :
-          m_storage(),
-          m_tile_origin(tile_origin)
-        {
+namespace ET
+{
 
-        }
 
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const {
-          result_type result;
+/*!
+ * Temporary n-dimensional memory.
+ *
+ * STORAGE_TYPE defines the memory storage
+ * TENSOR_TYPE defines what kind of tensor is returned by eval()
+ */
+template <typename STORAGE_TYPE, typename TENSOR_TYPE>
+class BlockLiteral
+    : public TensorExpressionBase<BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>>
+{
+public:
+  using self_type = BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>;
+  using storage_type = STORAGE_TYPE;
+  using tensor_type = TENSOR_TYPE;
+  using result_type = TENSOR_TYPE;
+  using ref_type = typename STORAGE_TYPE::ref_type;
+  using tile_type = typename ref_type::tile_type;
+  using index_type = camp::idx_t;
+
+  static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
+
+
+private:
+  storage_type m_storage;
+  tile_type m_tile_origin;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return storage_type::s_dim_elem(dim);
+  }
 
-          // load result from storage
-          result.load_ref(merge_ref_tile(m_storage.get_ref(), tile - m_tile_origin));
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr BlockLiteral(tile_type tile_origin)
+      : m_storage(), m_tile_origin(tile_origin)
+  {
+  }
 
-          return result;
-        }
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const &tile) const
+  {
+    result_type result;
 
+    // load result from storage
+    result.load_ref(merge_ref_tile(m_storage.get_ref(), tile - m_tile_origin));
 
-        /*!
-         *  Returns a ref that points at this data, shifted by its origin
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        ref_type get_ref() {
+    return result;
+  }
 
-          // compute shifited origin ref
-          return shift_tile_origin(m_storage.get_ref(), m_tile_origin);
 
-        }
+  /*!
+   *  Returns a ref that points at this data, shifted by its origin
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  ref_type get_ref()
+  {
 
+    // compute shifted origin ref
+    return shift_tile_origin(m_storage.get_ref(), m_tile_origin);
+  }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("BlockLiteral()");
-        }
 
-    };
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("BlockLiteral()"); }
+};
 
 
 //    /*
-//     * For TensorRegister nodes, we need to wrap this in a constant value ET node
+//     * For TensorRegister nodes, we need to wrap this in a constant value ET
+//     node
 //     */
 //    template<typename RHS>
 //    struct NormalizeOperandHelper<RHS,
-//    typename std::enable_if<std::is_base_of<RAJA::internal::TensorRegisterConcreteBase, RHS>::value>::type>
+//    typename
+//    std::enable_if<std::is_base_of<RAJA::internal::TensorRegisterConcreteBase,
+//    RHS>::value>::type>
 //    {
 //        using return_type = BlockLiteral<RHS>;
 //
@@ -134,10 +129,10 @@ namespace expt
 //        }
 //    };
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp b/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp
index 3e96a63462..8e3cd7583f 100644
--- a/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp
@@ -19,13 +19,10 @@
 #define RAJA_pattern_tensor_ET_ExpressionTemplateBase_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
-#include "RAJA/pattern/tensor/internal/TensorRef.hpp"
-
-#include "RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp"
 #include "RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp"
+#include "RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp"
+#include "RAJA/pattern/tensor/internal/TensorRef.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 //#define RAJA_DEBUG_PRINT_ET_AST
@@ -38,128 +35,126 @@ namespace expt
 {
 
 
-    class TensorRegisterConcreteBase;
+class TensorRegisterConcreteBase;
+
+namespace ET
+{
+
+//
+// forward decls
+//
+
+template <typename TENSOR_REGISTER_TYPE, typename REF_TYPE>
+class TensorLoadStore;
+
+
+template <typename LHS_TYPE, typename RHS_TYPE>
+class TensorMultiply;
+
+template <typename LHS_TYPE, typename RHS_TYPE>
+class TensorDivide;
 
-  namespace ET
+template <typename TENSOR_TYPE>
+class TensorNegate;
+
+template <typename TENSOR_TYPE>
+class TensorTranspose;
+
+
+// provides a non-templated base-type for all ET's
+// this allows using things like std::is_base_of
+class TensorExpressionConcreteBase
+{
+};
+
+
+template <typename DERIVED_TYPE>
+class TensorExpressionBase : public TensorExpressionConcreteBase
+{
+public:
+  using self_type = DERIVED_TYPE;
+
+private:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type *getThis() { return static_cast<self_type *>(this); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr self_type const *getThis() const
   {
+    return static_cast<self_type const *>(this);
+  }
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr camp::idx_t getDimBegin(camp::idx_t) const { return 0; }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorAdd<self_type, normalize_operand_t<RHS>>
+  operator+(RHS const &rhs) const
+  {
+    return TensorAdd<self_type, normalize_operand_t<RHS>>(*getThis(),
+                                                          normalizeOperand(
+                                                              rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE
+      TensorSubtract<self_type, normalize_operand_t<RHS>>
+      operator-(RHS const &rhs) const
+  {
+    return TensorSubtract<self_type, normalize_operand_t<RHS>>(*getThis(),
+                                                               normalizeOperand(
+                                                                   rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorNegate<self_type> operator-() const
+  {
+    return TensorNegate<self_type>(*getThis());
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE
+      TensorMultiply<self_type, normalize_operand_t<RHS>>
+      operator*(RHS const &rhs) const
+  {
+    return TensorMultiply<self_type, normalize_operand_t<RHS>>(*getThis(),
+                                                               normalizeOperand(
+                                                                   rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorDivide<self_type, normalize_operand_t<RHS>>
+  operator/(RHS const &rhs) const
+  {
+    return TensorDivide<self_type, normalize_operand_t<RHS>>(*getThis(),
+                                                             normalizeOperand(
+                                                                 rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorTranspose<self_type> transpose() const
+  {
+    return TensorTranspose<self_type>(*getThis());
+  }
+};
+
+
+}  // namespace ET
 
-    //
-    // forward decls
-    //
-
-    template<typename TENSOR_REGISTER_TYPE, typename REF_TYPE>
-    class TensorLoadStore;
-
-
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    class TensorMultiply;
-
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    class TensorDivide;
-
-    template<typename TENSOR_TYPE>
-    class TensorNegate;
-
-    template<typename TENSOR_TYPE>
-    class TensorTranspose;
-
-
-
-
-    // provides a non-templated base-type for all ET's
-    // this allows using things like std::is_base_of
-    class TensorExpressionConcreteBase{};
-
-
-    template<typename DERIVED_TYPE>
-    class TensorExpressionBase :public TensorExpressionConcreteBase {
-      public:
-        using self_type = DERIVED_TYPE;
-
-      private:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        self_type *getThis(){
-          return static_cast<self_type*>(this);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        self_type const *getThis() const {
-          return static_cast<self_type const*>(this);
-        }
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        camp::idx_t getDimBegin(camp::idx_t ) const
-        {
-          return 0;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorAdd<self_type, normalize_operand_t<RHS> >
-        operator+(RHS const &rhs) const {
-          return TensorAdd<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorSubtract<self_type, normalize_operand_t<RHS>>
-        operator-(RHS const &rhs) const {
-          return TensorSubtract<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorNegate<self_type>
-        operator-() const {
-          return TensorNegate<self_type>(*getThis());
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiply<self_type, normalize_operand_t<RHS>>
-        operator*(RHS const &rhs) const {
-          return TensorMultiply<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorDivide<self_type, normalize_operand_t<RHS>>
-        operator/(RHS const &rhs) const {
-          return TensorDivide<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorTranspose<self_type>
-        transpose() const {
-          return TensorTranspose<self_type>(*getThis());
-        }
-
-    };
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp b/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp
index e7e7223ce4..5e6cd0ca26 100644
--- a/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp
@@ -20,1210 +20,1220 @@
 #define RAJA_pattern_tensor_ET_MultiplyOperator_HPP
 
 
-
 namespace RAJA
 {
 namespace internal
 {
 namespace expt
 {
-  //forward
-  class TensorBlockConcreteBase;
+// forward declaration
+class TensorBlockConcreteBase;
+
 
+namespace ET
+{
 
 
+/*!
+ * Provides default multiply, multiply add, and multiply subtract
+ * operations.
+ *
+ * If the operands are both matrices, we perform a matrix-matrix multiply.
+ * Otherwise, we perform element-wise operations.
+ */
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          class ENABLE = void>
+struct MultiplyOperator {
 
-  namespace ET
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast()
   {
+    printf("Elemental(%d,%d)",
+           (int)s_num_dims,
+           (int)RIGHT_OPERAND_TYPE::s_num_dims);
+  }
 
 
-    /*!
-     * Provides default multiply, multiply add, and multiply subtract
-     * operations.
-     *
-     * If the operands are both matrices, we perform a matrix-matrix multiply.
-     * Otherwise, we perform element-wise operations.
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, class ENABLE = void>
-    struct MultiplyOperator
-    {
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &left,
+                        RIGHT_OPERAND_TYPE const &right)
+  {
+    return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise multiply
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+      -> decltype(left.eval(tile) * right.eval(tile))
+  {
+    return left.eval(tile) * right.eval(tile);
+  }
+
+
+  /*!
+   * Evaluate operands and perform element-wise multiply add
+   */
+  template <typename TILE_TYPE, typename ADD_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_add(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      ADD_OPERAND_TYPE const &add)
+      -> decltype(left.eval(tile).multiply_add(right.eval(tile),
+                                               add.eval(tile)))
+  {
+    return left.eval(tile).multiply_add(right.eval(tile), add.eval(tile));
+  }
+
+
+  /*!
+   * Evaluate operands and perform element-wise multiply subtract
+   */
+  template <typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_subtract(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      SUBTRACT_OPERAND_TYPE const &subtract)
+      -> decltype(left.eval(tile).multiply_subtract(right.eval(tile),
+                                                    subtract.eval(tile)))
+  {
+    return left.eval(tile).multiply_subtract(right.eval(tile),
+                                             subtract.eval(tile));
+  }
+};
 
-        using result_type = typename LEFT_OPERAND_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Elemental(%d,%d)", (int)s_num_dims, (int)RIGHT_OPERAND_TYPE::s_num_dims);
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) {
-          return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
-        }
-
-        /*!
-         * Evaluate operands and perform element-wise multiply
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) ->
-          decltype(left.eval(tile) * right.eval(tile))
-        {
-          return left.eval(tile) * right.eval(tile);
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply add
-         */
-        template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_OPERAND_TYPE const &add) ->
-          decltype(left.eval(tile).multiply_add(right.eval(tile), add.eval(tile)))
-        {
-          return left.eval(tile).multiply_add(right.eval(tile), add.eval(tile));
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply subtract
-         */
-        template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_subtract(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, SUBTRACT_OPERAND_TYPE const &subtract) ->
-          decltype(left.eval(tile).multiply_subtract(right.eval(tile), subtract.eval(tile)))
-        {
-          return left.eval(tile).multiply_subtract(right.eval(tile), subtract.eval(tile));
-        }
-
-
-    };
-
-
-    /*!
-     * Specialization that provides multiplying a scalar * tensor
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0>::type>
-    {
 
-        using result_type = typename RIGHT_OPERAND_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scale");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-          return right.getDimSize(dim);
-        }
-
-        /*!
-         * Evaluate operands and perform scaling operation
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) ->
-          decltype(right.eval(tile).scale(left.eval(tile)))
-        {
-          return right.eval(tile).scale(left.eval(tile));
-        }
-
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply add
-         */
-        template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_OPERAND_TYPE const &add) ->
-          decltype(right.eval(tile).scale(left.eval(tile)) + add.eval(tile))
-        {
-          return right.eval(tile).scale(left.eval(tile)) + add.eval(tile);
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply subtract
-         */
-        template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_subtract(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, SUBTRACT_OPERAND_TYPE const &subtract) ->
-          decltype(right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile))
-        {
-          return right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile);
-        }
-    };
-
-
-    /*!
-     * Specialization that provides multiplying a tensor*scalar
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
-    {
+/*!
+ * Specialization that provides multiplying a scalar * tensor
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0>::type> {
+
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scale"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &,
+                        RIGHT_OPERAND_TYPE const &right)
+  {
+    return right.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform scaling operation
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+      -> decltype(right.eval(tile).scale(left.eval(tile)))
+  {
+    return right.eval(tile).scale(left.eval(tile));
+  }
+
+
+  /*!
+   * Evaluate operands and perform element-wise multiply add
+   */
+  template <typename TILE_TYPE, typename ADD_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_add(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      ADD_OPERAND_TYPE const &add)
+      -> decltype(right.eval(tile).scale(left.eval(tile)) + add.eval(tile))
+  {
+    return right.eval(tile).scale(left.eval(tile)) + add.eval(tile);
+  }
+
+
+  /*!
+   * Evaluate operands and perform element-wise multiply subtract
+   */
+  template <typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_subtract(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      SUBTRACT_OPERAND_TYPE const &subtract)
+      -> decltype(right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile))
+  {
+    return right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile);
+  }
+};
 
-        using result_type = typename LEFT_OPERAND_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scale");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-          return left.getDimSize(dim);
-        }
-
-        /*!
-         * Evaluate operands and perform scaling operation
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) ->
-          decltype(left.eval(tile).scale(right.eval(tile)))
-        {
-          return left.eval(tile).scale(right.eval(tile));
-        }
-
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply add
-         */
-        template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_OPERAND_TYPE const &add) ->
-          decltype(left.eval(tile).scale(right.eval(tile)) + add.eval(tile))
-        {
-          return left.eval(tile).scale(right.eval(tile)) + add.eval(tile);
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply subtract
-         */
-        template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_subtract(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, SUBTRACT_OPERAND_TYPE const &subtract) ->
-          decltype(left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile))
-        {
-          return left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile);
-        }
-    };
-
-
-    /*!
-     * Specialization for matrix-vector right multiplication.
-     *
-     * By default the A*x operator for two matrices produces a matrix-vector
-     * multiplication.
-     *
-     * The right hand side vector is always treated as a column vector.
-     *
-     * The resulting vector type is inherited from the RHS
-     *
-     *
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims==1>::type>
-    {
 
-      using left_type = LEFT_OPERAND_TYPE;
-      using right_type = RIGHT_OPERAND_TYPE;
-      using result_type = typename LEFT_OPERAND_TYPE::result_type::column_vector_type;
-      static constexpr camp::idx_t s_num_dims = 1;
+/*!
+ * Specialization for tensor*scalar, chosen when the right operand has 0 dims.
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<RIGHT_OPERAND_TYPE::s_num_dims == 0>::type> {
+
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scale"); }  // prints operator label
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &left,
+                        RIGHT_OPERAND_TYPE const &)
+  {
+    return left.getDimSize(dim);  // sized by the left operand only
+  }
+
+  /*!
+   * Evaluate both operands on the tile and scale the left result by the right.
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+      -> decltype(left.eval(tile).scale(right.eval(tile)))
+  {
+    return left.eval(tile).scale(right.eval(tile));
+  }
+
+
+  /*!
+   * Scale evaluated left by right, then add the evaluated `add`.
+   */
+  template <typename TILE_TYPE, typename ADD_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_add(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      ADD_OPERAND_TYPE const &add)
+      -> decltype(left.eval(tile).scale(right.eval(tile)) + add.eval(tile))
+  {
+    return left.eval(tile).scale(right.eval(tile)) + add.eval(tile);
+  }
+
+
+  /*!
+   * Scale evaluated left by right, then subtract the evaluated `subtract`.
+   */
+  template <typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_subtract(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      SUBTRACT_OPERAND_TYPE const &subtract)
+      -> decltype(left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile))
+  {
+    return left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile);
+  }
+};
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast() {
-        printf("Matrx*Vector");
-      }
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-        return dim == 0 ? right.getDimSize(0) : 0;
-      }
+/*!
+ * Specialization for matrix-vector right multiplication.
+ *
+ * The A*x operator for a matrix (LHS) and a vector (RHS) produces a
+ * matrix-vector multiplication.
+ *
+ * The right hand side vector is always treated as a column vector.
+ *
+ * The resulting vector type is the column_vector_type of the LHS result type.
+ *
+ *
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 1>::type> {
+
+  using left_type = LEFT_OPERAND_TYPE;
+  using right_type = RIGHT_OPERAND_TYPE;
+  using result_type =
+      typename LEFT_OPERAND_TYPE::result_type::column_vector_type;
+  static constexpr camp::idx_t s_num_dims = 1;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Matrx*Vector"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &,
+                        RIGHT_OPERAND_TYPE const &right)
+  {
+    return dim == 0 ? right.getDimSize(0) : 0;
+  }
+
+  /*!
+   * Evaluate operands and perform matrix-vector multiply
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+  {
 
-      /*!
-       * Evaluate operands and perform element-wise multiply
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right){
+    // clear result
+    result_type result(0);
 
-        // clear result
-        result_type result(0);
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
 
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
+    return result;
+  }
 
-        return result;
-      }
+  template <typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply_add(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      ADD_TYPE const &add)
+  {
 
-      template<typename TILE_TYPE, typename ADD_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add){
+    // evaluate add into result
+    result_type result = add.eval(tile);
 
-        // evaluate add into result
-        result_type result = add.eval(tile);
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
 
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
+    return result;
+  }
 
-        return result;
-      }
+private:
+  template <typename STORAGE, typename TILE_TYPE, typename INDEX = void>
+  struct MultiplyBridge;
 
-    private:
-
-      template<typename STORAGE, typename TILE_TYPE, typename INDEX=void>
-      struct MultiplyBridge;
-
-      template<typename STORAGE, typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-      {
-        //using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
-
-        // get tile size from matrix type
-        auto tile_size = left_type::result_type::s_dim_elem(1);
-        auto k_size = et_left.getDimSize(1);
-        // TODO: check that left and right are compatible
-        // m_left.getDimSize(1) == m_right.getDimSize(0)
-        // how do we provide checking for this kind of error?
-
-        // tile over row of left and column of right
-        auto left_tile = LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
-        left_tile.m_begin[0] = tile.m_begin[0];
-        left_tile.m_size[0] = tile.m_size[0];
-        left_tile.m_size[1] = tile_size;
-
-        using RightType = typename TILE_TYPE::nonstatic_self_type;
-
-        RightType right_tile = tile;
-        right_tile.m_size[0] = tile_size;
-
-        // Do full tiles in k
-        decltype(k_size) k = 0;
-        for(;k+tile_size <= k_size; k+= tile_size){
-
-          // evaluate both sides of operator
-          left_tile.m_begin[1] = k;
-          auto left = et_left.eval(left_tile);
-
-          right_tile.m_begin[0] = k;
-          auto right = et_right.eval(right_tile);
-
-          // accumulate product
-          result = left.right_multiply_vector_accumulate(right, result);
-        }
-        // remainder tile in k
-        if(k < k_size){
-          auto &left_part_tile = make_tensor_tile_partial(left_tile);
-          left_part_tile.m_begin[1] = k;
-          left_part_tile.m_size[1] = k_size-k;
-          auto left = et_left.eval(left_part_tile);
-
-          auto &right_part_tile = make_tensor_tile_partial(right_tile);
-          right_part_tile.m_begin[0] = k;
-          right_part_tile.m_size[0] = k_size-k;
-          auto right = et_right.eval(right_part_tile);
-
-          // accumulate product of partial tile
-          result = left.right_multiply_vector_accumulate(right, result);
-        }
+  template <typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void multiply_into_result(
+      STORAGE &result,
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &et_left,
+      RIGHT_OPERAND_TYPE const &et_right)
+  {
+    // using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
 
-      }
+    // get tile size from matrix type
+    auto tile_size = left_type::result_type::s_dim_elem(1);
+    auto k_size = et_left.getDimSize(1);
+    // TODO: check that left and right are compatible
+    // m_left.getDimSize(1) == m_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
 
+    // tile over row of left and column of right
+    auto left_tile =
+        LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
+    left_tile.m_begin[0] = tile.m_begin[0];
+    left_tile.m_size[0] = tile.m_size[0];
+    left_tile.m_size[1] = tile_size;
 
-      template<typename T>
-      struct Diag{
-          static_assert(!std::is_same<T,void>::value,"diag");
-      };
-
-      template<typename I, TensorTileSize TTS, typename B, typename S>
-      struct Diag< StaticTensorTile<I,TTS,B,S> >{
-          static_assert(std::is_same<I,void>::value,"diag");
-      };
-
-      template<typename STORAGE, typename TILE_TYPE, typename INDEX>
-      struct MultiplyBridge {
-
-          Diag<TILE_TYPE> diag;
-
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-            //using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
-    
-            // get tile size from matrix type
-            auto tile_size = left_type::result_type::s_dim_elem(1);
-            auto k_size = et_left.getDimSize(1);
-            // TODO: check that left and right are compatible
-            // m_left.getDimSize(1) == m_right.getDimSize(0)
-            // how do we provide checking for this kind of error?
-    
-            // tile over row of left and column of right
-            auto left_tile = LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
-            left_tile.m_begin[0] = tile.m_begin[0];
-            left_tile.m_size[0] = tile.m_size[0];
-            left_tile.m_size[1] = tile_size;
-    
-            using RightType = typename TILE_TYPE::nonstatic_self_type;
-
-            RightType right_tile = tile;
-            right_tile.m_size[0] = tile_size;
-    
-            // Do full tiles in k
-            decltype(k_size) k = 0;
-            for(;k+tile_size <= k_size; k+= tile_size){
-    
-              // evaluate both sides of operator
-              left_tile.m_begin[1] = k;
-              auto left = et_left.eval(left_tile);
-    
-              right_tile.m_begin[0] = k;
-              auto right = et_right.eval(right_tile);
-    
-              // accumulate product
-              result = left.right_multiply_vector_accumulate(right, result);
-            }
-            // remainder tile in k
-            if(k < k_size){
-              auto &left_part_tile = make_tensor_tile_partial(left_tile);
-              left_part_tile.m_begin[1] = k;
-              left_part_tile.m_size[1] = k_size-k;
-              auto left = et_left.eval(left_part_tile);
-    
-              auto &right_part_tile = make_tensor_tile_partial(right_tile);
-              right_part_tile.m_begin[0] = k;
-              right_part_tile.m_size[0] = k_size-k;
-              auto right = et_right.eval(right_part_tile);
-    
-              // accumulate product of partial tile
-              result = left.right_multiply_vector_accumulate(right, result);
-            }
-    
-          }
-      };
-
-
-
-
-      template<
-          size_t INDEX,
-          typename STORAGE,
-          typename INDEX_TYPE,
-          TensorTileSize TENSOR_SIZE,
-          INDEX_TYPE Begin0, INDEX_TYPE... BeginTail,
-          INDEX_TYPE  Size0, INDEX_TYPE... SizeTail
-      >
-      struct MultiplyBridge <
-          STORAGE,
-          StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >,
-          camp::integral_constant<size_t,INDEX>
-      > {
-
-          using TileType = StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >;
-              
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TileType const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-
-              // get tile size from matrix type
-              const auto tile_size = left_type::result_type::s_dim_elem(1);
-              const auto k_size = et_left.getDimSize(1);
-             
-              auto const offset = INDEX*tile_size;
-
-              if( (offset + tile_size) <= k_size ) {
-    
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE, Begin0,    offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, tile_size>
-                    >;
-                    // evaluate both sides of operator
-                    auto left = et_left.eval(LeftType());
-
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE,    offset>,
-                        camp::int_seq<INDEX_TYPE, tile_size>
-                    >;
-    
-                    auto right = et_right.eval(RightType());
-    
-                    // accumulate product
-                    auto temp = left.right_multiply_vector_accumulate(right, result);
-                    MultiplyBridge<STORAGE,TileType,camp::integral_constant<size_t,INDEX-1>>::multiply_into_result(result,tile,et_left,et_right);
-                    result += temp;
-                    
-              } else {
-
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE, Begin0,        offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, k_size-offset>
-                    >;
-		    auto left = et_left.eval(LeftType());
-	    
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE,        offset>,
-                        camp::int_seq<INDEX_TYPE, k_size-offset>
-                    >;
-		    auto right = et_right.eval(RightType());
-	    
-		    // accumulate product of partial tile
-		    result = left.right_multiply_vector_accumulate(right, result);
-
-              }
-
-
-            }
-          };
-
-
-      template<
-          typename STORAGE,
-          typename INDEX_TYPE,
-          TensorTileSize TENSOR_SIZE,
-          INDEX_TYPE Begin0, INDEX_TYPE... BeginTail,
-          INDEX_TYPE  Size0, INDEX_TYPE...  SizeTail
-      >
-      struct MultiplyBridge <
-          STORAGE,
-          StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >,
-          camp::integral_constant<size_t,0>
-      > {
-
-          using TileType = StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >;
-              
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TileType const &, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-
-              // get tile size from matrix type
-              const auto tile_size = left_type::result_type::s_dim_elem(1);
-              const auto k_size = et_left.getDimSize(1);
-             
-              auto const offset = 0;
-
-              if( (offset + tile_size) <= k_size ) {
-    
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE, Begin0,    offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, tile_size>
-                    >;
-                    // evaluate both sides of operator
-                    auto left = et_left.eval(LeftType());
-
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE,    offset>,
-                        camp::int_seq<INDEX_TYPE, tile_size>
-                    >;
-    
-                    auto right = et_right.eval(RightType());
-    
-                    // accumulate product
-                    auto temp = left.right_multiply_vector_accumulate(right, result);
-                    result += temp;
-                    
-              } else {
-
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE, Begin0,        offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, k_size-offset>
-                    >;
-		    auto left = et_left.eval(LeftType());
-	    
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE,        offset>,
-                        camp::int_seq<INDEX_TYPE, k_size-offset>
-                    >;
-		    auto right = et_right.eval(RightType());
-	    
-		    // accumulate product of partial tile
-		    result = left.right_multiply_vector_accumulate(right, result);
-
-              }
-
-
-            }
-          };
-
-      template<
-          typename STORAGE,
-          typename INDEX_TYPE,
-          TensorTileSize TENSOR_SIZE,
-          INDEX_TYPE Begin0,  INDEX_TYPE... BeginTail,
-          INDEX_TYPE  Size0,  INDEX_TYPE... SizeTail
-      >
-      struct MultiplyBridge <
-          STORAGE,
-          StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >,
-          void
-      > {
-
-          using TileType = StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >;
-              
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TileType const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-
-              const auto tile_size = left_type::result_type::s_dim_elem(1);
-              const auto k_size = et_left.getDimSize(1);
-              const size_t iter_count = (k_size/tile_size) + ( (k_size%tile_size != 0) ? 1 : 0 );
-
-              MultiplyBridge<STORAGE,TileType,camp::integral_constant<size_t,iter_count>>::multiply_into_result(result,tile,et_left,et_right);
-
-            }
-          };
-
-      };
-
-
-
-
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, typename ADD_OPERAND_TYPE>
-    class TensorMultiplyAdd;
-
-
-    /*!
-     * Specialization for vector*matrix left multiplication.
-     *
-     * By default the x'*A operator for two matrices produces a vector-matrix
-     * multiplication.
-     *
-     * The left hand side vector is always treated as a row vector.
-     *
-     * The resulting vector type is inherited from the LHS
-     *
-     *
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 && RIGHT_OPERAND_TYPE::s_num_dims==2>::type>
+    using RightType = typename TILE_TYPE::nonstatic_self_type;
+
+    RightType right_tile = tile;
+    right_tile.m_size[0] = tile_size;
+
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size) {
+
+      // evaluate both sides of operator
+      left_tile.m_begin[1] = k;
+      auto left = et_left.eval(left_tile);
+
+      right_tile.m_begin[0] = k;
+      auto right = et_right.eval(right_tile);
+
+      // accumulate product
+      result = left.right_multiply_vector_accumulate(right, result);
+    }
+    // remainder tile in k
+    if (k < k_size) {
+      auto &left_part_tile = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[1] = k;
+      left_part_tile.m_size[1] = k_size - k;
+      auto left = et_left.eval(left_part_tile);
+
+      auto &right_part_tile = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k;
+      right_part_tile.m_size[0] = k_size - k;
+      auto right = et_right.eval(right_part_tile);
+
+      // accumulate product of partial tile
+      result = left.right_multiply_vector_accumulate(right, result);
+    }
+  }
+
+
+  template <typename T>
+  struct Diag {
+    static_assert(!std::is_same<T, void>::value, "diag");
+  };
+
+  template <typename I, TensorTileSize TTS, typename B, typename S>
+  struct Diag<StaticTensorTile<I, TTS, B, S>> {
+    static_assert(std::is_same<I, void>::value, "diag");
+  };
+
+  template <typename STORAGE, typename TILE_TYPE, typename INDEX>
+  struct MultiplyBridge {
+
+    Diag<TILE_TYPE> diag;
+
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE &result,
+                                     TILE_TYPE const &tile,
+                                     LEFT_OPERAND_TYPE const &et_left,
+                                     RIGHT_OPERAND_TYPE const &et_right)
     {
+      // using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
 
-      using left_type = LEFT_OPERAND_TYPE;
-      using right_type = RIGHT_OPERAND_TYPE;
-      using result_type = typename RIGHT_OPERAND_TYPE::result_type::row_vector_type;
-      static constexpr camp::idx_t s_num_dims = 1;
+      // get tile size from matrix type
+      auto tile_size = left_type::result_type::s_dim_elem(1);
+      auto k_size = et_left.getDimSize(1);
+      // TODO: check that left and right are compatible
+      // m_left.getDimSize(1) == m_right.getDimSize(0)
+      // how do we provide checking for this kind of error?
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast() {
-        printf("Vector*Matrix");
-      }
+      // tile over row of left and column of right
+      auto left_tile =
+          LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
+      left_tile.m_begin[0] = tile.m_begin[0];
+      left_tile.m_size[0] = tile.m_size[0];
+      left_tile.m_size[1] = tile_size;
+
+      using RightType = typename TILE_TYPE::nonstatic_self_type;
+
+      RightType right_tile = tile;
+      right_tile.m_size[0] = tile_size;
+
+      // Do full tiles in k
+      decltype(k_size) k = 0;
+      for (; k + tile_size <= k_size; k += tile_size) {
+
+        // evaluate both sides of operator
+        left_tile.m_begin[1] = k;
+        auto left = et_left.eval(left_tile);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return dim == 0 ? left.getDimSize(0) : 0;
+        right_tile.m_begin[0] = k;
+        auto right = et_right.eval(right_tile);
+
+        // accumulate product
+        result = left.right_multiply_vector_accumulate(right, result);
+      }
+      // remainder tile in k
+      if (k < k_size) {
+        auto &left_part_tile = make_tensor_tile_partial(left_tile);
+        left_part_tile.m_begin[1] = k;
+        left_part_tile.m_size[1] = k_size - k;
+        auto left = et_left.eval(left_part_tile);
+
+        auto &right_part_tile = make_tensor_tile_partial(right_tile);
+        right_part_tile.m_begin[0] = k;
+        right_part_tile.m_size[0] = k_size - k;
+        auto right = et_right.eval(right_part_tile);
+
+        // accumulate product of partial tile
+        result = left.right_multiply_vector_accumulate(right, result);
       }
+    }
+  };
+
+
+  template <size_t INDEX,
+            typename STORAGE,
+            typename INDEX_TYPE,
+            TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE Begin0,
+            INDEX_TYPE... BeginTail,
+            INDEX_TYPE Size0,
+            INDEX_TYPE... SizeTail>
+  struct MultiplyBridge<
+      STORAGE,
+      StaticTensorTile<INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                       camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>,
+      camp::integral_constant<size_t, INDEX>> {
+
+    using TileType =
+        StaticTensorTile<INDEX_TYPE,
+                         TENSOR_SIZE,
+                         camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                         camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>;
+
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE &result,
+                                     TileType const &tile,
+                                     LEFT_OPERAND_TYPE const &et_left,
+                                     RIGHT_OPERAND_TYPE const &et_right)
+    {
 
-      /*!
-       * Evaluate operands and perform element-wise multiply
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right){
-        // clear result
-        result_type result(0);
-
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
-
-        return result;
+      // get tile size from matrix type
+      const auto tile_size = left_type::result_type::s_dim_elem(1);
+      const auto k_size = et_left.getDimSize(1);
+
+      auto const offset = INDEX * tile_size;
+
+      if ((offset + tile_size) <= k_size) {
+
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE,
+                             TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, tile_size>>;
+        // evaluate both sides of operator
+        auto left = et_left.eval(LeftType());
+
+        using RightType =
+            StaticTensorTile<INDEX_TYPE,
+                             TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, tile_size>>;
+
+        auto right = et_right.eval(RightType());
+
+        // accumulate product
+        auto temp = left.right_multiply_vector_accumulate(right, result);
+        MultiplyBridge<STORAGE,
+                       TileType,
+                       camp::integral_constant<size_t, INDEX - 1>>::
+            multiply_into_result(result, tile, et_left, et_right);
+        result += temp;
+
+      } else {
+
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE,
+                             TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, k_size - offset>>;
+        auto left = et_left.eval(LeftType());
+
+        using RightType =
+            StaticTensorTile<INDEX_TYPE,
+                             TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, k_size - offset>>;
+        auto right = et_right.eval(RightType());
+
+        // accumulate product of partial tile
+        result = left.right_multiply_vector_accumulate(right, result);
       }
+    }
+  };
+
+
+  template <typename STORAGE,
+            typename INDEX_TYPE,
+            TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE Begin0,
+            INDEX_TYPE... BeginTail,
+            INDEX_TYPE Size0,
+            INDEX_TYPE... SizeTail>
+  struct MultiplyBridge<
+      STORAGE,
+      StaticTensorTile<INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                       camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>,
+      camp::integral_constant<size_t, 0>> {
+
+    using TileType =
+        StaticTensorTile<INDEX_TYPE,
+                         TENSOR_SIZE,
+                         camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                         camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>;
+
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE &result,
+                                     TileType const &,
+                                     LEFT_OPERAND_TYPE const &et_left,
+                                     RIGHT_OPERAND_TYPE const &et_right)
+    {
+
+      // get tile size from matrix type
+      const auto tile_size = left_type::result_type::s_dim_elem(1);
+      const auto k_size = et_left.getDimSize(1);
+
+      auto const offset = 0;
+
+      if ((offset + tile_size) <= k_size) {
+
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE,
+                             TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, tile_size>>;
+        // evaluate both sides of operator
+        auto left = et_left.eval(LeftType());
+
+        using RightType =
+            StaticTensorTile<INDEX_TYPE,
+                             TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, tile_size>>;
 
-      template<typename TILE_TYPE, typename ADD_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add){
-        // evaluate add into result
-        result_type result = add.eval(tile);
+        auto right = et_right.eval(RightType());
 
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
+        // accumulate product
+        auto temp = left.right_multiply_vector_accumulate(right, result);
+        result += temp;
 
-        return result;
+      } else {
+
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE,
+                             TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, k_size - offset>>;
+        auto left = et_left.eval(LeftType());
+
+        using RightType =
+            StaticTensorTile<INDEX_TYPE,
+                             TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, k_size - offset>>;
+        auto right = et_right.eval(RightType());
+
+        // accumulate product of partial tile
+        result = left.right_multiply_vector_accumulate(right, result);
       }
+    }
+  };
+
+  template <typename STORAGE,
+            typename INDEX_TYPE,
+            TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE Begin0,
+            INDEX_TYPE... BeginTail,
+            INDEX_TYPE Size0,
+            INDEX_TYPE... SizeTail>
+  struct MultiplyBridge<
+      STORAGE,
+      StaticTensorTile<INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                       camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>,
+      void> {
+
+    using TileType =
+        StaticTensorTile<INDEX_TYPE,
+                         TENSOR_SIZE,
+                         camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                         camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>;
+
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE &result,
+                                     TileType const &tile,
+                                     LEFT_OPERAND_TYPE const &et_left,
+                                     RIGHT_OPERAND_TYPE const &et_right)
+    {
 
-    private:
-      template<typename STORAGE, typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-      {
-        // get tile size from matrix type
-        auto tile_size = right_type::result_type::s_dim_elem(0);
-        auto k_size = et_right.getDimSize(0);
+      const auto tile_size = left_type::result_type::s_dim_elem(1);
+      const auto k_size = et_left.getDimSize(1);
+      const size_t iter_count =
+          (k_size / tile_size) + ((k_size % tile_size != 0) ? 1 : 0);
 
+      MultiplyBridge<STORAGE,
+                     TileType,
+                     camp::integral_constant<size_t, iter_count>>::
+          multiply_into_result(result, tile, et_left, et_right);
+    }
+  };
+};
 
-        // TODO: check that left and right are compatible
-        // m_left.getDimSize(1) == m_right.getDimSize(0)
-        // how do we provide checking for this kind of error?
 
-        // tile over row of left and column of right
-        auto right_tile = RIGHT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
-        right_tile.m_begin[1] = tile.m_begin[0];
-        right_tile.m_size[1] = tile.m_size[0];
-        right_tile.m_size[0] = tile_size;
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          typename ADD_OPERAND_TYPE>
+class TensorMultiplyAdd;
+
+
+/*!
+ * Specialization for vector*matrix left multiplication.
+ *
+ * By default the x'*A operator for a vector x and a matrix A produces a
+ * vector-matrix multiplication.
+ *
+ * The left hand side vector is always treated as a row vector.
+ *
+ * The resulting vector type is the row_vector_type of the RHS matrix type.
+ *
+ *
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type> {
+
+  using left_type = LEFT_OPERAND_TYPE;
+  using right_type = RIGHT_OPERAND_TYPE;
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type::row_vector_type;
+  static constexpr camp::idx_t s_num_dims = 1;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Vector*Matrix"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &left,
+                        RIGHT_OPERAND_TYPE const &)
+  {
+    return dim == 0 ? left.getDimSize(0) : 0;
+  }
+
+  /*!
+   * Evaluate operands and compute the vector*matrix product
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+  {
+    // clear result
+    result_type result(0);
+
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
 
-        TILE_TYPE left_tile = tile;
-        left_tile.m_size[0] = tile_size;
+    return result;
+  }
 
+  template <typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply_add(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      ADD_TYPE const &add)
+  {
+    // evaluate add into result
+    result_type result = add.eval(tile);
+
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
+
+    return result;
+  }
+
+private:
+  template <typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void multiply_into_result(
+      STORAGE &result,
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &et_left,
+      RIGHT_OPERAND_TYPE const &et_right)
+  {
+    // get tile size from matrix type
+    auto tile_size = right_type::result_type::s_dim_elem(0);
+    auto k_size = et_right.getDimSize(0);
 
-        // Do full tiles in k
-        decltype(k_size) k = 0;
-        for(;k+tile_size <= k_size; k+= tile_size){
 
-          // evaluate both sides of operator
-          right_tile.m_begin[0] = k;
-          auto right = et_right.eval(right_tile);
+    // TODO: check that left and right are compatible
+    // et_left.getDimSize(0) == et_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
 
-          left_tile.m_begin[0] = k;
-          auto left = et_left.eval(left_tile);
+    // tile over row of left and column of right
+    auto right_tile =
+        RIGHT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
+    right_tile.m_begin[1] = tile.m_begin[0];
+    right_tile.m_size[1] = tile.m_size[0];
+    right_tile.m_size[0] = tile_size;
 
-          // accumulate product
-          result = right.left_multiply_vector_accumulate(left, result);
+    TILE_TYPE left_tile = tile;
+    left_tile.m_size[0] = tile_size;
 
-        }
-        // remainder tile in k
-        if(k < k_size){
-          auto &right_part_tile = make_tensor_tile_partial(right_tile);
-          right_part_tile.m_begin[0] = k;
-          right_part_tile.m_size[0] = k_size-k;
-          auto right = et_right.eval(right_part_tile);
 
-          auto &left_part_tile = make_tensor_tile_partial(left_tile);
-          left_part_tile.m_begin[0] = k;
-          left_part_tile.m_size[0] = k_size-k;
-          auto left = et_left.eval(left_part_tile);
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size) {
 
-          // compute product into x of partial tile
-          result = right.left_multiply_vector_accumulate(left, result);
-        }
+      // evaluate both sides of operator
+      right_tile.m_begin[0] = k;
+      auto right = et_right.eval(right_tile);
 
-      }
+      left_tile.m_begin[0] = k;
+      auto left = et_left.eval(left_tile);
 
-    };
+      // accumulate product
+      result = right.left_multiply_vector_accumulate(left, result);
+    }
+    // remainder tile in k
+    if (k < k_size) {
+      auto &right_part_tile = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k;
+      right_part_tile.m_size[0] = k_size - k;
+      auto right = et_right.eval(right_part_tile);
+
+      auto &left_part_tile = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[0] = k;
+      left_part_tile.m_size[0] = k_size - k;
+      auto left = et_left.eval(left_part_tile);
+
+      // compute product into x of partial tile
+      result = right.left_multiply_vector_accumulate(left, result);
+    }
+  }
+};
 
 
+/*!
+ * Specialization for matrix-matrix multiplication for TensorRegisters
+ *
+ * By default the A*B operator for two matrices produces a matrix-matrix
+ * multiplication.
+ *
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type> {
+
+  using left_type = LEFT_OPERAND_TYPE;
+  using right_type = RIGHT_OPERAND_TYPE;
+  using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
+  static constexpr camp::idx_t s_num_dims = 2;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Matrx*Matrix"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &left,
+                        RIGHT_OPERAND_TYPE const &right)
+  {
+    return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
+  }
+
+  /*!
+   * Evaluate operands and compute the matrix*matrix product
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+  {
 
-    /*!
-     * Specialization for matrix-matrix multiplication for TensorRegisters
+    /*
+     *
+     * For TensorRegister:
+     *
+     *   Returns a register containing product of left and right operands
+     *
+     * For TensorBlock:
+     *
+     *  Returns an ET TensorLiteral containing the left and right operands
      *
-     * By default the A*B operator for two matrices produces a matrix-matrix
-     * multiplication.
+     *  OR
+     *
+     *  Returns an ET multiply
      *
      */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<
-    LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims==2>::type>
-    {
+    // create zeroed temporary
+    result_type result;
+    result.broadcast(0);
+
+    // multiply left and right operands into temporary
+    multiply_into_result(result, tile, left, right);
+
+    return result;
+  }
+
+  template <typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply_add(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      ADD_TYPE const &add)
+  {
 
-      using left_type = LEFT_OPERAND_TYPE;
-      using right_type = RIGHT_OPERAND_TYPE;
-      using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
-      static constexpr camp::idx_t s_num_dims = 2;
+    // start accumulator with addition term
+    result_type result = add.eval(tile);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast() {
-        printf("Matrx*Matrix");
-      }
+    multiply_into_result(result, tile, left, right);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) {
-        return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
-      }
+    return result;
+  }
 
-      /*!
-       * Evaluate operands and perform element-wise multiply
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-
-        /*
-         *
-         * For TensorRegister:
-         *
-         *   Return's a register containing product of left and right operands
-         *
-         * For TensorBlock:
-         *
-         *  Return's an ET TensorLiteral containing the left and right operrands
-         *
-         *  OR
-         *
-         *  Returns an ET multiply
-         *
-         */
-        // create zeroed temporary
-        result_type result;
-        result.broadcast(0);
-
-        // multiply left and right operands into temporary
-        multiply_into_result(result, tile, left,right);
-
-        return result;
-      }
+private:
+  template <typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void multiply_into_result(
+      STORAGE &result,
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &et_left,
+      RIGHT_OPERAND_TYPE const &et_right)
+  {
+    // get tile size from matrix type
+    using right_tensor_type = typename right_type::result_type;
+    auto tile_size = right_tensor_type::s_dim_elem(0);
+    auto k_size = et_left.getDimSize(1);
 
-      template<typename TILE_TYPE, typename ADD_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add)
-      {
+    // TODO: check that left and right are compatible
+    // m_left.getDimSize(1) == m_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
 
-        // start accumulator with addition term
-        result_type result = add.eval(tile);
+    // tile over row of left and column of right
+    TILE_TYPE left_tile = tile;
+    left_tile.m_size[1] = tile_size;
+    auto left_begin = et_left.getDimBegin(1);
 
-        multiply_into_result(result, tile, left, right);
+    TILE_TYPE right_tile = tile;
+    right_tile.m_size[0] = tile_size;
+    auto right_begin = et_right.getDimBegin(0);
 
-        return result;
 
-      }
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size) {
 
-    private:
-      template<typename STORAGE, typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-      {
-        // get tile size from matrix type
-        using right_tensor_type = typename right_type::result_type;
-        auto tile_size = right_tensor_type::s_dim_elem(0);
-        auto k_size = et_left.getDimSize(1);
-
-        // TODO: check that left and right are compatible
-        // m_left.getDimSize(1) == m_right.getDimSize(0)
-        // how do we provide checking for this kind of error?
-
-        // tile over row of left and column of right
-        TILE_TYPE left_tile = tile;
-        left_tile.m_size[1] = tile_size;
-        auto left_begin = et_left.getDimBegin(1);
-
-        TILE_TYPE right_tile = tile;
-        right_tile.m_size[0] = tile_size;
-        auto right_begin = et_right.getDimBegin(0);
-
-
-        // Do full tiles in k
-        decltype(k_size) k = 0;
-        for(;k+tile_size <= k_size; k+= tile_size){
-
-          // evaluate both sides of operator
-          left_tile.m_begin[1] = k + left_begin;
-          auto left = et_left.eval(left_tile);
-
-          right_tile.m_begin[0] = k + right_begin;
-          auto right = et_right.eval(right_tile);
-
-          // accumulate product
-          left.matrix_multiply_accumulate(result, right);
-        }
-        // remainder tile in k
-        if(k < k_size){
-
-          auto &left_part_tile = make_tensor_tile_partial(left_tile);
-          left_part_tile.m_begin[1] = k + left_begin;
-          left_part_tile.m_size[1] = k_size-k;
-          auto left = et_left.eval(left_part_tile);
-
-          auto &right_part_tile = make_tensor_tile_partial(right_tile);
-          right_part_tile.m_begin[0] = k + right_begin;
-          right_part_tile.m_size[0] = k_size-k;
-          auto right = et_right.eval(right_part_tile);
-
-          // accumulate product
-          left.matrix_multiply_accumulate(result, right);
-        }
-      }
+      // evaluate both sides of operator
+      left_tile.m_begin[1] = k + left_begin;
+      auto left = et_left.eval(left_tile);
 
-    };
+      right_tile.m_begin[0] = k + right_begin;
+      auto right = et_right.eval(right_tile);
 
+      // accumulate product
+      left.matrix_multiply_accumulate(result, right);
+    }
+    // remainder tile in k
+    if (k < k_size) {
 
+      auto &left_part_tile = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[1] = k + left_begin;
+      left_part_tile.m_size[1] = k_size - k;
+      auto left = et_left.eval(left_part_tile);
 
+      auto &right_part_tile = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k + right_begin;
+      right_part_tile.m_size[0] = k_size - k;
+      auto right = et_right.eval(right_part_tile);
+
+      // accumulate product
+      left.matrix_multiply_accumulate(result, right);
+    }
+  }
+};
 
 
-    template<typename OPERAND_TYPE, typename TILE_TYPE>
-    class RestrictExtents : public TensorExpressionBase<RestrictExtents<OPERAND_TYPE, TILE_TYPE>> {
-      public:
-        using self_type = RestrictExtents<OPERAND_TYPE, TILE_TYPE>;
-        using operand_type = OPERAND_TYPE;
-        using result_type = typename OPERAND_TYPE::result_type;
-        using index_type = typename TILE_TYPE::index_type;
-        using tile_type = TILE_TYPE;
-        static constexpr camp::idx_t s_num_dims = OPERAND_TYPE::s_num_dims;
+template <typename OPERAND_TYPE, typename TILE_TYPE>
+class RestrictExtents
+    : public TensorExpressionBase<RestrictExtents<OPERAND_TYPE, TILE_TYPE>>
+{
+public:
+  using self_type = RestrictExtents<OPERAND_TYPE, TILE_TYPE>;
+  using operand_type = OPERAND_TYPE;
+  using result_type = typename OPERAND_TYPE::result_type;
+  using index_type = typename TILE_TYPE::index_type;
+  using tile_type = TILE_TYPE;
+  static constexpr camp::idx_t s_num_dims = OPERAND_TYPE::s_num_dims;
+
+private:
+  operand_type m_operand;
+  tile_type m_tile;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  RestrictExtents(operand_type const &operand, tile_type const &tile)
+      : m_operand{operand}, m_tile{tile}
+  {
+  }
 
-      private:
-        operand_type m_operand;
-        tile_type m_tile;
 
-      public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return m_tile.m_size[dim];
+  }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        RestrictExtents(operand_type const &operand, tile_type const &tile) :
-        m_operand{operand}, m_tile{tile}
-        {}
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimBegin(camp::idx_t dim) const
+  {
+    return m_tile.m_begin[dim];
+  }
 
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_tile.m_size[dim];
-        }
+  template <typename TILE_TYPE2>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE2 const &tile) const
+      -> decltype(m_operand.eval(tile))
+  {
+    return m_operand.eval(tile);
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("RestrictExtents(");
+    m_operand.print_ast();
+    printf(")");
+  }
+};
+
+template <typename OPERAND, typename TILE>
+RestrictExtents<OPERAND, TILE> restrictExtents(OPERAND const &operand,
+                                               TILE const &tile)
+{
+  using tile_type = typename OPERAND::tile_type;
+  tile_type new_tile;
+  new_tile.copy(tile);
+  return RestrictExtents<OPERAND, TILE>(operand, new_tile);
+}
+
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimBegin(camp::idx_t dim) const {
-          return m_tile.m_begin[dim];
-        }
+/*!
+ * Specialization for matrix-matrix multiplication for TensorBlocks
+ *
+ * By default the A*B operator for two matrices produces a matrix-matrix
+ * multiplication.
+ *
+ */
 
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<
+        std::is_base_of<TensorBlockConcreteBase,
+                        typename RIGHT_OPERAND_TYPE::tensor_type>::value &&
+        LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+        RIGHT_OPERAND_TYPE::s_num_dims == 2>::type> {
+  using left_type = LEFT_OPERAND_TYPE;
+  using right_type = RIGHT_OPERAND_TYPE;
+  using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
+  static constexpr camp::idx_t s_num_dims = 2;
 
-        template<typename TILE_TYPE2>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE2 const &tile) const ->
-          decltype(m_operand.eval(tile))
-        {
-          return m_operand.eval(tile);
-        }
+  //      static_assert(LEFT_OPERAND_TYPE::s_num_dims == 1, "WHAOO");
+  //      static_assert(! std::is_base_of<TensorBlockConcreteBase, typename
+  //      RIGHT_OPERAND_TYPE::tensor_type>::value, "MATCH");
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("RestrictExtents(");
-          m_operand.print_ast();
-          printf(")");
-        }
 
+  // This tensor type is a TensorBlock of some kind
+  using tensor_type = typename RIGHT_OPERAND_TYPE::tensor_type;
 
-    };
+  // Get the storage type from the TensorBlock
+  using storage_type = typename tensor_type::storage_type;
 
-    template<typename OPERAND, typename TILE>
-    RestrictExtents<OPERAND, TILE> restrictExtents(OPERAND const &operand, TILE const &tile){
-      using tile_type = typename OPERAND::tile_type;
-      tile_type new_tile;
-      new_tile.copy(tile);
-      return RestrictExtents<OPERAND, TILE>(operand, new_tile);
-    }
+  // Create a BlockLiteral that uses the TensorBlock's indicated storage
+  // and has an eval() that produces the TensorBlock's register type
+  using block_literal =
+      BlockLiteral<storage_type, typename tensor_type::register_type>;
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Matrx*Matrix"); }
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &left,
+                        RIGHT_OPERAND_TYPE const &right)
+  {
+    return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
+  }
+
+  /*!
+   * Construct and return the BlockLiteral for this tile; operands are unused
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static block_literal multiply(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &,
+      RIGHT_OPERAND_TYPE const &)  //->
+                                   /// decltype(TensorMultiply<decltype(left.eval(tile)),
+                                   /// decltype(right.eval(tile))>(left.eval(tile),
+                                   /// right.eval(tile)))
+  {
 
-    /*!
-     * Specialization for matrix-matrix multiplication for TensorBlocks
+    /*
+     * First pass:  just return a Multiply ET that evaluates the block
+     * with underlying TensorRegisters
+     *
+     *
+     * Second pass: we want to return a TensorLiteral ET node with the
+     * matrix product already evaluated?
      *
-     * By default the A*B operator for two matrices produces a matrix-matrix
-     * multiplication.
+     * What we really care about is improving the data reuse: so perhaps
+     * returning a Multiply ET node with TensorLiteral nodes for each
+     * of the operands
+     *
+     */
+    // create a BlockLiteral
+    block_literal result(tile);
+
+    // evaluate the block-wise product into result
+
+    // return TensorMultiply<decltype(left.eval(tile)),
+    // decltype(right.eval(tile))>(left.eval(tile), right.eval(tile));
+
+    // return the BlockLiteral ET
+    return result;
+  }
+
+  template <typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static block_literal multiply_add(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right,
+      ADD_TYPE const &add)  //->
+                            // decltype(TensorMultiplyAdd<decltype(left.eval(tile)),
+                            // decltype(right.eval(tile)),
+                            // decltype(add.eval(tile))>(left.eval(tile),
+                            // right.eval(tile), add.eval(tile)))
+  {
+    /*
+     * First pass:  we want to return a BlockLiteral ET node with the
+     * matrix product already evaluated.  We do this by creating
+     * a LoadStore node wrapping the BlockLiteral, and evaluating it as
+     * a sub-expression.
+     *
+     * What we really care about is improving the data reuse: so perhaps
+     * returning a Multiply ET node with TensorLiteral nodes for each
+     * of the operands
      *
      */
 
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<
-    std::is_base_of<TensorBlockConcreteBase, typename RIGHT_OPERAND_TYPE::tensor_type>::value &&
-    LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims==2>::type>
-    {
-        using left_type = LEFT_OPERAND_TYPE;
-        using right_type = RIGHT_OPERAND_TYPE;
-        using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
-        static constexpr camp::idx_t s_num_dims = 2;
+    // create a BlockLiteral
+    using block_tile_type = typename block_literal::tile_type;
+    block_tile_type block_tile;
+    block_tile.copy(tile);
+    block_literal result(block_tile);
+
+    using ref_type = typename block_literal::ref_type;
+    using load_store_type = TensorLoadStore<tensor_type, ref_type>;
+
+    // initialize the result with our addition term
+    auto result_et = load_store_type(result.get_ref()).eval(tile);
+    result_et = add.eval(tile);
+
+    // return TensorMultiplyAdd<decltype(left.eval(tile)),
+    // decltype(right.eval(tile)), decltype(add.eval(tile))>(left.eval(tile),
+    // right.eval(tile), add.eval(tile));
+
+    //          multiply_into_result(result_et, tile, restrictExtents(left,
+    //          tile), restrictExtents(right, tile));
+    multiply_into_result(result_et, tile, left, right);
+
+    // return the BlockLiteral ET
+    return result;
+  }
+
+private:
+  template <typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void multiply_into_result(
+      STORAGE &result,
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &et_left,
+      RIGHT_OPERAND_TYPE const &et_right)
+  {
 
-  //      static_assert(LEFT_OPERAND_TYPE::s_num_dims == 1, "WHAOO");
-  //      static_assert(! std::is_base_of<TensorBlockConcreteBase, typename RIGHT_OPERAND_TYPE::tensor_type>::value, "MATCH");
-
-
-        // This tensor type is a TensorBlock of some kind
-        using tensor_type = typename RIGHT_OPERAND_TYPE::tensor_type;
-
-        // Get the storage type from the TensorBlock
-        using storage_type = typename tensor_type::storage_type;
-
-        // Create a BlockLiteral that uses the TensorBlock's indicated storage
-        // and has an eval() that produces the TensorBlock's register type
-        using block_literal = BlockLiteral<storage_type,
-                                           typename tensor_type::register_type>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Matrx*Matrix");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) {
-          return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
-        }
-
-        /*!
-         * Evaluate operands and perform element-wise multiply
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        block_literal multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &) //->
-          ///decltype(TensorMultiply<decltype(left.eval(tile)), decltype(right.eval(tile))>(left.eval(tile), right.eval(tile)))
-        {
-
-          /*
-           * First pass:  just return a Multiply ET that evaluates the block
-           * with underlying TensorRegisters
-           *
-           *
-           * Second pass: we want to return a TensorLiteral ET node with the
-           * matrix product already evaluated.?
-           *
-           * What we really care about is improving the data reuse: so perhaps
-           * returning a Multiply ET node with TensorLiteral nodes for each
-           * of the operands
-           *
-           */
-          // create a BlockLiteral
-          block_literal result(tile);
-
-          // evaluate the block-wise product into result
-
-          //return TensorMultiply<decltype(left.eval(tile)), decltype(right.eval(tile))>(left.eval(tile), right.eval(tile));
-
-          // return the BlockLiterat ET
-          return result;
-        }
-
-        template<typename TILE_TYPE, typename ADD_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        block_literal multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add) //->
-          //decltype(TensorMultiplyAdd<decltype(left.eval(tile)), decltype(right.eval(tile)), decltype(add.eval(tile))>(left.eval(tile), right.eval(tile), add.eval(tile)))
-        {
-          /*
-           * First pass:  we want to return a BlockLiteral ET node with the
-           * matrix product already evaluated.  We do this by creating
-           * a LoadStore node wrapping the BlockLiteral, and evaluating it as
-           * a sub-expression.
-           *
-           * What we really care about is improving the data reuse: so perhaps
-           * returning a Multiply ET node with TensorLiteral nodes for each
-           * of the operands
-           *
-           */
-
-          // create a BlockLiteral
-          using block_tile_type = typename block_literal::tile_type;
-          block_tile_type block_tile;
-          block_tile.copy(tile);
-          block_literal result(block_tile);
-
-          using ref_type = typename block_literal::ref_type;
-          using load_store_type = TensorLoadStore<tensor_type, ref_type>;
-
-          // initialize the result with our addition term
-          auto result_et = load_store_type(result.get_ref()).eval(tile);
-          result_et = add.eval(tile);
-
-          //return TensorMultiplyAdd<decltype(left.eval(tile)), decltype(right.eval(tile)), decltype(add.eval(tile))>(left.eval(tile), right.eval(tile), add.eval(tile));
-
-//          multiply_into_result(result_et, tile, restrictExtents(left, tile), restrictExtents(right, tile));
-          multiply_into_result(result_et, tile, left, right);
-
-          // return the BlockLiterat ET
-          return result;
-        }
-
-      private:
-
-        template<typename STORAGE, typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-        {
-
-          // get tile size from matrix type
-          auto tile_size = result_type::s_dim_elem(1);
-          auto k_size = et_left.getDimSize(1);
-
-          // TODO: check that left and right are compatible
-          // m_left.getDimSize(1) == m_right.getDimSize(0)
-          // how do we provide checking for this kind of error?
-
-          // tile over row of left and column of right
-          TILE_TYPE left_tile = tile;
-          left_tile.m_size[1] = tile_size;
-          auto left_begin = et_left.getDimBegin(1);
-
-          TILE_TYPE right_tile = tile;
-          right_tile.m_size[0] = tile_size;
-          auto right_begin = et_right.getDimBegin(0);
-
-
-
-          // Do full tiles in k
-          decltype(k_size) k = 0;
-          for(;k+tile_size <= k_size; k+= tile_size){
-
-
-            // evaluate both sides of operator
-            left_tile.m_begin[1] = k + left_begin;
-            auto left = et_left.eval(left_tile);
-
-            right_tile.m_begin[0] = k + right_begin;
-            auto right = et_right.eval(right_tile);
-
-            // accumulate product
-            //left.matrix_multiply_accumulate(result, right);
-            result += restrictExtents(left, left_tile) * restrictExtents(right, right_tile);
-          }
-          // remainder tile in k
-          if(k < k_size){
-
-            auto &left_part_tile = make_tensor_tile_partial(left_tile);
-            left_part_tile.m_begin[1] = k + left_begin;
-            left_part_tile.m_size[1] = k_size-k;
-            auto left = et_left.eval(left_part_tile);
-
-            auto &right_part_tile = make_tensor_tile_partial(right_tile);
-            right_part_tile.m_begin[0] = k + right_begin;
-            right_part_tile.m_size[0] = k_size-k;
-            auto right = et_right.eval(right_part_tile);
-
-            // accumulate product
-            //left.matrix_multiply_accumulate(result, right);
-            result += restrictExtents(left, left_part_tile) * restrictExtents(right, right_part_tile);
-          }
-        }
-    };
-
-
-  } // namespace ET
+    // get tile size from matrix type
+    auto tile_size = result_type::s_dim_elem(1);
+    auto k_size = et_left.getDimSize(1);
+
+    // TODO: check that left and right are compatible
+    // et_left.getDimSize(1) == et_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
+
+    // tile over row of left and column of right
+    TILE_TYPE left_tile = tile;
+    left_tile.m_size[1] = tile_size;
+    auto left_begin = et_left.getDimBegin(1);
+
+    TILE_TYPE right_tile = tile;
+    right_tile.m_size[0] = tile_size;
+    auto right_begin = et_right.getDimBegin(0);
+
+
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size) {
+
+
+      // evaluate both sides of operator
+      left_tile.m_begin[1] = k + left_begin;
+      auto left = et_left.eval(left_tile);
+
+      right_tile.m_begin[0] = k + right_begin;
+      auto right = et_right.eval(right_tile);
+
+      // accumulate product
+      // left.matrix_multiply_accumulate(result, right);
+      result +=
+          restrictExtents(left, left_tile) * restrictExtents(right, right_tile);
+    }
+    // remainder tile in k
+    if (k < k_size) {
+
+      auto &left_part_tile = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[1] = k + left_begin;
+      left_part_tile.m_size[1] = k_size - k;
+      auto left = et_left.eval(left_part_tile);
+
+      auto &right_part_tile = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k + right_begin;
+      right_part_tile.m_size[0] = k_size - k;
+      auto right = et_right.eval(right_part_tile);
+
+      // accumulate product
+      // left.matrix_multiply_accumulate(result, right);
+      result += restrictExtents(left, left_part_tile) *
+                restrictExtents(right, right_part_tile);
+    }
+  }
+};
+
+
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp
index faa92747dd..5cedb6702a 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp
@@ -19,10 +19,8 @@
 #define RAJA_pattern_tensor_ET_TensorDivide_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -33,346 +31,371 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          class ENABLE = void>
+struct DivideOperator;
+
+
+/*!
+ * Specialization that provides dividing a scalar by a vector
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 1>::type> {
+
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &,
+                        RIGHT_OPERAND_TYPE const &right)
+  {
+    return right.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+  {
+    result_type numerator(left.eval(tile));
+
+    if (tile.s_tensor_size == TENSOR_FULL) {
+      return numerator.divide(right.eval(tile));
+    }
+
+    return numerator.divide_n(right.eval(tile), tile.m_size[0]);
+  }
+};
+
+
+/*!
+ * Specialization that provides dividing a vector by a scalar
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 0>::type> {
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &left,
+                        RIGHT_OPERAND_TYPE const &)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+  {
+    result_type denominator(right.eval(tile));
+
+    if (tile.s_tensor_size == TENSOR_FULL) {
+      return left.eval(tile).divide(denominator);
+    } else {
+      return left.eval(tile).divide_n(denominator, tile.m_size[0]);
+    }
+  }
+};
+
+
+/*!
+ * Specialization that provides dividing a vector by a vector
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 1>::type> {
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &left,
+                        RIGHT_OPERAND_TYPE const &)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+  {
+    if (tile.s_tensor_size == TENSOR_FULL) {
+      return left.eval(tile).divide(right.eval(tile));
+    } else {
+      return left.eval(tile).divide_n(right.eval(tile), tile.m_size[0]);
+    }
+  }
+};
+
+
+/*!
+ * Specialization that provides dividing a scalar by a matrix
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type> {
+
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &,
+                        RIGHT_OPERAND_TYPE const &right)
+  {
+    return right.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+  {
+    result_type numerator(left.eval(tile));
+
+    if (tile.s_tensor_size == TENSOR_FULL) {
+      return numerator.divide(right.eval(tile));
+    }
+
+    return numerator.divide_nm(right.eval(tile),
+                               tile.m_size[0],
+                               tile.m_size[1]);
+  }
+};
+
+
+/*!
+ * Specialization that provides dividing a matrix by a scalar
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 0>::type> {
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &left,
+                        RIGHT_OPERAND_TYPE const &)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
   {
+    result_type denominator(right.eval(tile));
+
+    if (tile.s_tensor_size == TENSOR_FULL) {
+      return left.eval(tile).divide(denominator);
+    } else {
+      return left.eval(tile).divide_nm(denominator,
+                                       tile.m_size[0],
+                                       tile.m_size[1]);
+    }
+  }
+};
+
 
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, class ENABLE = void>
-    struct DivideOperator;
-
-
-
-    /*!
-     * Specialization that provides dividing a scalar by a vector
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 && RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
-    {
-
-      using result_type = typename RIGHT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-        return right.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type numerator(left.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return numerator.divide(right.eval(tile));
-        }
-
-        return numerator.divide_n(right.eval(tile), tile.m_size[0]);
-
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a scalar
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 && RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
-    {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type denominator(right.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(denominator);
-        }
-        else{
-          return left.eval(tile).divide_n(denominator, tile.m_size[0]);
-        }
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a vector
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 && RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
-    {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(right.eval(tile));
-        }
-        else{
-          return left.eval(tile).divide_n(right.eval(tile), tile.m_size[0]);
-        }
-      }
-    };
-
-
-
-
-
-
-    /*!
-     * Specialization that provides dividing a scalar by a matrix
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 && RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
-    {
-
-      using result_type = typename RIGHT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-        return right.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type numerator(left.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return numerator.divide(right.eval(tile));
-        }
-
-        return numerator.divide_nm(right.eval(tile), tile.m_size[0], tile.m_size[1]);
-
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a scalar
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
-    {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      RAJA_SUPPRESS_HD_WARN
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type denominator(right.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(denominator);
-        }
-        else{
-          return left.eval(tile).divide_nm(denominator, tile.m_size[0], tile.m_size[1]);
-        }
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a vector
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
-    {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(right.eval(tile));
-        }
-        else{
-          return left.eval(tile).divide_nm(right.eval(tile), tile.m_size[0], tile.m_size[1]);
-        }
-      }
-    };
-
-
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    class TensorDivide: public TensorExpressionBase<TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>> {
-      public:
-        using self_type = TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-        using left_operand_type = LEFT_OPERAND_TYPE;
-        using right_operand_type = RIGHT_OPERAND_TYPE;
-        using element_type = typename LEFT_OPERAND_TYPE::element_type;
-        using index_type = typename LEFT_OPERAND_TYPE::index_type;
-
-        using divide_op = DivideOperator<left_operand_type, right_operand_type>;
-        using result_type = typename divide_op::result_type;
-        static constexpr camp::idx_t s_num_dims = divide_op::s_num_dims;
-
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorDivide(left_operand_type const &left_operand, right_operand_type const &right_operand) :
-        m_left_operand{left_operand}, m_right_operand{right_operand}
-        {}
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return divide_op::getDimSize(dim, m_left_operand, m_right_operand);
-        }
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const
-        {
-          return divide_op::divide(tile, m_left_operand, m_right_operand);
-        }
-
-        /*!
-         * Returns the LHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        left_operand_type const &getLeftOperand() const {
-          return m_left_operand;
-        }
-
-        /*!
-         * Returns the RHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        right_operand_type const &getRightOperand() const {
-          return m_right_operand;
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Divide(");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(")");
-        }
-
-
-    };
-
-
-    /*
-     * Overload for:    arithmetic / tensorexpression
-
-     */
-    template<typename LHS, typename RHS,
-      typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator/(LHS const &left_operand, RHS const &right_operand) ->
-    TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>
-    {
-      return TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>(NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
+/*!
+ * Specialization that provides dividing a matrix by a matrix
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type> {
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const &left,
+                        RIGHT_OPERAND_TYPE const &)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const &tile,
+      LEFT_OPERAND_TYPE const &left,
+      RIGHT_OPERAND_TYPE const &right)
+  {
+    if (tile.s_tensor_size == TENSOR_FULL) {
+      return left.eval(tile).divide(right.eval(tile));
+    } else {
+      return left.eval(tile).divide_nm(right.eval(tile),
+                                       tile.m_size[0],
+                                       tile.m_size[1]);
     }
+  }
+};
+
+
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+class TensorDivide : public TensorExpressionBase<
+                         TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>>
+{
+public:
+  using self_type = TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+  using left_operand_type = LEFT_OPERAND_TYPE;
+  using right_operand_type = RIGHT_OPERAND_TYPE;
+  using element_type = typename LEFT_OPERAND_TYPE::element_type;
+  using index_type = typename LEFT_OPERAND_TYPE::index_type;
+
+  using divide_op = DivideOperator<left_operand_type, right_operand_type>;
+  using result_type = typename divide_op::result_type;
+  static constexpr camp::idx_t s_num_dims = divide_op::s_num_dims;
+
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorDivide(left_operand_type const &left_operand,
+               right_operand_type const &right_operand)
+      : m_left_operand{left_operand}, m_right_operand{right_operand}
+  {
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return divide_op::getDimSize(dim, m_left_operand, m_right_operand);
+  }
+
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const &tile) const
+  {
+    return divide_op::divide(tile, m_left_operand, m_right_operand);
+  }
+
+  /*!
+   * Returns the LHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr left_operand_type const &getLeftOperand() const
+  {
+    return m_left_operand;
+  }
+
+  /*!
+   * Returns the RHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr right_operand_type const &getRightOperand() const
+  {
+    return m_right_operand;
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Divide(");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(")");
+  }
+};
+
+
+/*
+ * Overload for:    arithmetic / tensorexpression
+
+ */
+template <
+    typename LHS,
+    typename RHS,
+    typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
+    typename std::enable_if<
+        std::is_base_of<TensorExpressionConcreteBase, RHS>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator/(LHS const &left_operand,
+                                            RHS const &right_operand)
+    -> TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>
+{
+  return TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>(
+      NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
+}
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp
index 6720a304f2..01dfedda61 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp
@@ -19,10 +19,8 @@
 #define RAJA_pattern_tensor_ET_TensorLiteral_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -33,76 +31,71 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+
+template <typename TENSOR_TYPE>
+class TensorLiteral : public TensorExpressionBase<TensorLiteral<TENSOR_TYPE>>
+{
+public:
+  using self_type = TensorLiteral<TENSOR_TYPE>;
+  using tensor_type = TENSOR_TYPE;
+  using element_type = typename TENSOR_TYPE::element_type;
+  using result_type = tensor_type;
+  using index_type = RAJA::Index_type;
+
+  static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return tensor_type::s_dim_elem(dim);
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  explicit TensorLiteral(tensor_type const &value) : m_value{value} {}
+
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const &) const
+  {
+    return result_type(m_value);
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("TensorLiteral()"); }
+
+private:
+  tensor_type m_value;
+};
+
+
+/*
+ * For TensorRegister nodes, we need to wrap this in a constant value ET node
+ */
+template <typename RHS>
+struct NormalizeOperandHelper<
+    RHS,
+    typename std::enable_if<
+        std::is_base_of<TensorRegisterConcreteBase, RHS>::value>::type> {
+  using return_type = TensorLiteral<RHS>;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr return_type normalize(RHS const &rhs)
   {
+    return return_type(rhs);
+  }
+};
 
+}  // namespace ET
 
-    template<typename TENSOR_TYPE>
-    class TensorLiteral :  public TensorExpressionBase<TensorLiteral<TENSOR_TYPE>> {
-      public:
-        using self_type = TensorLiteral<TENSOR_TYPE>;
-        using tensor_type = TENSOR_TYPE;
-        using element_type = typename TENSOR_TYPE::element_type;
-        using result_type = tensor_type;
-        using index_type = RAJA::Index_type;
-
-        static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return tensor_type::s_dim_elem(dim);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        explicit
-        TensorLiteral(tensor_type const &value) :
-        m_value{value}
-        {}
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &) const {
-          return result_type(m_value);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("TensorLiteral()");
-        }
-
-      private:
-        tensor_type m_value;
-    };
-
-
-    /*
-     * For TensorRegister nodes, we need to wrap this in a constant value ET node
-     */
-    template<typename RHS>
-    struct NormalizeOperandHelper<RHS,
-    typename std::enable_if<std::is_base_of<TensorRegisterConcreteBase, RHS>::value>::type>
-    {
-        using return_type = TensorLiteral<RHS>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        return_type normalize(RHS const &rhs){
-          return return_type(rhs);
-        }
-    };
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp
index 3b69552a32..7e28933d64 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp
@@ -19,11 +19,9 @@
 #define RAJA_pattern_tensor_ET_TensorLoadStore_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/TensorTileExec.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -34,220 +32,185 @@ namespace expt
 {
 
 
+namespace ET
+{
 
 
+template <typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
+struct TensorStoreFunctor {
+  LHS_TYPE const &m_lhs;
+  RHS_TYPE const &m_rhs;
 
-  namespace ET
+  template <typename TILE_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE void operator()(TILE_TYPE const &tile) const
   {
 
 
+    /*
+     *
+     * For recursive ET types, eval() produces a new ET, and
+     * eval_lhs() produces a new TensorLoadStore.
+     *
+     */
 
-    template<typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
-    struct TensorStoreFunctor
-    {
-        LHS_TYPE const &m_lhs;
-        RHS_TYPE const &m_rhs;
-
-        template<typename TILE_TYPE>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void operator()(TILE_TYPE const &tile) const {
-
-
-          /*
-           *
-           * For recursive ET types, eval() produces a new ET, and
-           * eval_lhs() produces a new TensorLoadStore.
-           *
-           */
-
-          m_lhs.eval_lhs(tile) = m_rhs.eval(tile);
-
-        }
-    };
-
-    template<typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    auto makeTensorStoreFunctor(LHS_TYPE const &lhs, RHS_TYPE const &rhs) ->
-    TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>
-    {
-      return TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>{lhs, rhs};
-    }
-
-
-    template<typename TENSOR_TYPE, typename REF_TYPE>
-    class TensorLoadStore : public TensorExpressionBase<TensorLoadStore<TENSOR_TYPE, REF_TYPE>> {
-      public:
-        using self_type = TensorLoadStore<TENSOR_TYPE, REF_TYPE>;
-        using tensor_type = TENSOR_TYPE;
-        using element_type = typename TENSOR_TYPE::element_type;
-        using index_type = typename REF_TYPE::index_type;
-        using ref_type = REF_TYPE;
-        using tile_type = typename REF_TYPE::tile_type;
-        using result_type = TENSOR_TYPE;
-
-        static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
-
-
-      private:
-        ref_type m_ref;
-
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        explicit
-        TensorLoadStore(ref_type const &ref) : m_ref{ref}
-        {
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorLoadStore(self_type const &rhs) : m_ref(rhs.m_ref)
-        {}
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print() const {
-          printf("TensorLoadStore: ");
-          m_ref.m_tile.print();
-        }
-
-//        RAJA_SUPPRESS_HD_WARN
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator=(self_type const &rhs)
-        {
-          store(rhs);
-          return *this;
-        }
-
-//        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator=(RHS const &rhs)
-        {
-
-          store(normalizeOperand(rhs));
-
-          return *this;
-        }
-
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator+=(RHS const &rhs)
-        {
-          store( normalizeOperand(rhs) + (*this) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator-=(RHS const &rhs)
-        {
-          store(TensorSubtract<self_type, RHS>(*this, normalizeOperand(rhs)) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type operator*=(RHS const &rhs)
-        {
-          store(TensorMultiply<self_type, RHS>(*this, normalizeOperand(rhs)) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type operator/=(RHS const &rhs)
-        {
-          store(TensorDivide<self_type, RHS>(*this, normalizeOperand(rhs)) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(tensor_type::s_load_ref(merge_ref_tile(m_ref, tile)))
-        {
-          return tensor_type::s_load_ref(merge_ref_tile(m_ref, tile));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval_lhs(TILE_TYPE const &tile) const ->
-          decltype(TENSOR_TYPE::create_et_store_ref(merge_ref_tile(this->m_ref, tile)))
-        {
-          return TENSOR_TYPE::create_et_store_ref(merge_ref_tile(m_ref, tile));
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_ref.m_tile.m_size[dim];
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Load()");
-        }
-
-      private:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        tile_type const &getTile() const {
-          return m_ref.m_tile;
-        }
-
-
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void store(RHS const &rhs)
-        {
-#ifdef RAJA_DEBUG_PRINT_ET_AST
-          printf("Store(");
-          rhs.print_ast();
-          printf(")\n");
-#endif
+    m_lhs.eval_lhs(tile) = m_rhs.eval(tile);
+  }
+};
+
+template <typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr auto makeTensorStoreFunctor(
+    LHS_TYPE const &lhs,
+    RHS_TYPE const &rhs) -> TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>
+{
+  return TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>{lhs, rhs};
+}
+
+
+template <typename TENSOR_TYPE, typename REF_TYPE>
+class TensorLoadStore
+    : public TensorExpressionBase<TensorLoadStore<TENSOR_TYPE, REF_TYPE>>
+{
+public:
+  using self_type = TensorLoadStore<TENSOR_TYPE, REF_TYPE>;
+  using tensor_type = TENSOR_TYPE;
+  using element_type = typename TENSOR_TYPE::element_type;
+  using index_type = typename REF_TYPE::index_type;
+  using ref_type = REF_TYPE;
+  using tile_type = typename REF_TYPE::tile_type;
+  using result_type = TENSOR_TYPE;
+
+  static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
+
+
+private:
+  ref_type m_ref;
+
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  explicit TensorLoadStore(ref_type const &ref) : m_ref{ref} {}
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorLoadStore(self_type const &rhs) : m_ref(rhs.m_ref) {}
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print() const
+  {
+    printf("TensorLoadStore: ");
+    m_ref.m_tile.print();
+  }
+
+  //        RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator=(self_type const &rhs)
+  {
+    store(rhs);
+    return *this;
+  }
+
+  //        RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &operator=(RHS const &rhs)
+  {
 
-          tensorTileExec<tensor_type>(m_ref.m_tile,
-              makeTensorStoreFunctor<tensor_type>(*this, rhs));
-        }
+    store(normalizeOperand(rhs));
 
+    return *this;
+  }
 
 
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &operator+=(RHS const &rhs)
+  {
+    store(normalizeOperand(rhs) + (*this));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &operator-=(RHS const &rhs)
+  {
+    store(TensorSubtract<self_type, RHS>(*this, normalizeOperand(rhs)));
+    return *this;
+  }
 
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator*=(RHS const &rhs)
+  {
+    store(TensorMultiply<self_type, RHS>(*this, normalizeOperand(rhs)));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator/=(RHS const &rhs)
+  {
+    store(TensorDivide<self_type, RHS>(*this, normalizeOperand(rhs)));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const &tile) const
+      -> decltype(tensor_type::s_load_ref(merge_ref_tile(m_ref, tile)))
+  {
+    return tensor_type::s_load_ref(merge_ref_tile(m_ref, tile));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval_lhs(TILE_TYPE const &tile) const
+      -> decltype(TENSOR_TYPE::create_et_store_ref(merge_ref_tile(this->m_ref,
+                                                                  tile)))
+  {
+    return TENSOR_TYPE::create_et_store_ref(merge_ref_tile(m_ref, tile));
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return m_ref.m_tile.m_size[dim];
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("Load()"); }
+
+private:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  tile_type const &getTile() const { return m_ref.m_tile; }
+
+
+  template <typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE void store(RHS const &rhs)
+  {
+#ifdef RAJA_DEBUG_PRINT_ET_AST
+    printf("Store(");
+    rhs.print_ast();
+    printf(")\n");
+#endif
 
-    };
+    tensorTileExec<tensor_type>(m_ref.m_tile,
+                                makeTensorStoreFunctor<tensor_type>(*this,
+                                                                    rhs));
+  }
+};
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp
index 3e3429588f..52ed6e3fad 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp
@@ -19,11 +19,9 @@
 #define RAJA_pattern_tensor_ET_TensorMultiply_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -33,127 +31,139 @@ namespace internal
 namespace expt
 {
 
-  namespace ET
+namespace ET
+{
+
+// forward decl for FMA contraction
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          typename ADD_TYPE>
+class TensorMultiplyAdd;
+
+
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+class TensorMultiply
+    : public TensorExpressionBase<
+          TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>>
+{
+public:
+  using self_type = TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+  using left_operand_type = LEFT_OPERAND_TYPE;
+  using right_operand_type = RIGHT_OPERAND_TYPE;
+  using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+
+  using element_type = typename LEFT_OPERAND_TYPE::element_type;
+  using index_type = typename LEFT_OPERAND_TYPE::index_type;
+
+  using result_type = typename multiply_op::result_type;
+  static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorMultiply(left_operand_type const &left_operand,
+                 right_operand_type const &right_operand)
+      : m_left_operand{left_operand}, m_right_operand{right_operand}
+  {
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr int getDimSize(int dim) const
   {
+    return multiply_op::getDimSize(dim, m_left_operand, m_right_operand);
+  }
+
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const &tile) const
+      -> decltype(multiply_op::multiply(tile, m_left_operand, m_right_operand))
+  {
+    return multiply_op::multiply(tile, m_left_operand, m_right_operand);
+  }
+
+  /*!
+   * Returns the LHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr left_operand_type const &getLeftOperand() const
+  {
+    return m_left_operand;
+  }
+
+  /*!
+   * Returns the RHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr right_operand_type const &getRightOperand() const
+  {
+    return m_right_operand;
+  }
+
+
+  /*!
+   * operator+ overload that forms a FMA contraction
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template <typename ADD>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorMultiplyAdd<left_operand_type,
+                                                 right_operand_type,
+                                                 normalize_operand_t<ADD>>
+  operator+(ADD const &add) const
+  {
+    return TensorMultiplyAdd<left_operand_type,
+                             right_operand_type,
+                             normalize_operand_t<ADD>>(m_left_operand,
+                                                       m_right_operand,
+                                                       normalizeOperand(add));
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Multiply[");
+    multiply_op::print_ast();
+    printf("](");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(")");
+  }
+};
+
+
+/*
+ * Overload for:    arithmetic * tensorexpression
+ *
+ */
+template <
+    typename LHS,
+    typename RHS,
+    typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
+    typename std::enable_if<
+        std::is_base_of<TensorExpressionConcreteBase, RHS>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator*(LHS const &left_operand,
+                                            RHS const &right_operand)
+    -> TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>
+{
+  return TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>(
+      NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
+}
+
+}  // namespace ET
 
-    // forward decl for FMA contraction
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, typename ADD_TYPE>
-    class TensorMultiplyAdd;
-
-
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    class TensorMultiply : public TensorExpressionBase<TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>> {
-      public:
-        using self_type = TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-        using left_operand_type = LEFT_OPERAND_TYPE;
-        using right_operand_type = RIGHT_OPERAND_TYPE;
-        using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-
-        using element_type = typename LEFT_OPERAND_TYPE::element_type;
-        using index_type = typename LEFT_OPERAND_TYPE::index_type;
-
-        using result_type = typename multiply_op::result_type;
-        static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiply(left_operand_type const &left_operand, right_operand_type const &right_operand) :
-        m_left_operand{left_operand}, m_right_operand{right_operand}
-        {}
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        int getDimSize(int dim) const {
-          return multiply_op::getDimSize(dim, m_left_operand, m_right_operand);
-        }
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(multiply_op::multiply(tile, m_left_operand, m_right_operand))
-        {
-          return multiply_op::multiply(tile, m_left_operand, m_right_operand);
-        }
-
-        /*!
-         * Returns the LHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        left_operand_type const &getLeftOperand() const {
-          return m_left_operand;
-        }
-
-        /*!
-         * Returns the RHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        right_operand_type const &getRightOperand() const {
-          return m_right_operand;
-        }
-
-
-        /*!
-         * operator+ overload that forms a FMA contraction
-         */
-        RAJA_SUPPRESS_HD_WARN
-        template<typename ADD>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiplyAdd<left_operand_type, right_operand_type, normalize_operand_t<ADD>>
-        operator+(ADD const &add) const {
-          return TensorMultiplyAdd<left_operand_type, right_operand_type, normalize_operand_t<ADD>>(m_left_operand, m_right_operand, normalizeOperand(add));
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Multiply[");
-          multiply_op::print_ast();
-          printf("](");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(")");
-        }
-
-
-    };
-
-
-    /*
-     * Overload for:    arithmetic * tensorexpression
-
-     */
-    template<typename LHS, typename RHS,
-      typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator*(LHS const &left_operand, RHS const &right_operand) ->
-    TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>
-    {
-      return TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>(NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
-    }
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp
index 44f27e92c7..88cc26bdb3 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp
@@ -19,11 +19,9 @@
 #define RAJA_pattern_tensor_ET_TensorMultiplyAddAdd_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp"
+#include "RAJA/util/macros.hpp"
 
 namespace RAJA
 {
@@ -33,81 +31,93 @@ namespace expt
 {
 
 
-  namespace ET
-  {
-
-
-    /*!
-     * Expression for LHS*RHS+ADD, which allows for accessing FMA style
-     * operations.
-     *
-     * This ET can only be generated by contracting an Add and Multiple ET.
-     *
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, typename ADD_OPERAND_TYPE>
-    class TensorMultiplyAdd : public TensorExpressionBase<TensorMultiplyAdd<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE, ADD_OPERAND_TYPE>> {
-      public:
-        using self_type = TensorMultiplyAdd<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE, ADD_OPERAND_TYPE>;
-        using left_operand_type = LEFT_OPERAND_TYPE;
-        using right_operand_type = RIGHT_OPERAND_TYPE;
-        using add_operand_type = ADD_OPERAND_TYPE;
-        using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-
-        using element_type = typename LEFT_OPERAND_TYPE::element_type;
-        using index_type = typename LEFT_OPERAND_TYPE::index_type;
-
-        using result_type = typename multiply_op::result_type;
-        static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-        add_operand_type m_add_operand;
-
-      public:
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiplyAdd(left_operand_type const &left_operand, right_operand_type const &right_operand,
-                          add_operand_type const &add_operand) :
-        m_left_operand{left_operand}, m_right_operand{right_operand}, m_add_operand{add_operand}
-        {}
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(multiply_op::multiply_add(tile, m_left_operand, m_right_operand, m_add_operand))
-        {
-          return multiply_op::multiply_add(tile, m_left_operand, m_right_operand, m_add_operand);
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("MultiplyAdd[");
-          multiply_op::print_ast();
-          printf("](");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(", ");
-          m_add_operand.print_ast();
-          printf(")");
-        }
-
-
+namespace ET
+{
 
-    };
 
+/*!
+ * Expression for LHS*RHS+ADD, which allows for accessing FMA style
+ * operations.
+ *
+ * This ET can only be generated by contracting an Add and Multiply ET.
+ *
+ */
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          typename ADD_OPERAND_TYPE>
+class TensorMultiplyAdd
+    : public TensorExpressionBase<TensorMultiplyAdd<LEFT_OPERAND_TYPE,
+                                                    RIGHT_OPERAND_TYPE,
+                                                    ADD_OPERAND_TYPE>>
+{
+public:
+  using self_type = TensorMultiplyAdd<LEFT_OPERAND_TYPE,
+                                      RIGHT_OPERAND_TYPE,
+                                      ADD_OPERAND_TYPE>;
+  using left_operand_type = LEFT_OPERAND_TYPE;
+  using right_operand_type = RIGHT_OPERAND_TYPE;
+  using add_operand_type = ADD_OPERAND_TYPE;
+  using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+
+  using element_type = typename LEFT_OPERAND_TYPE::element_type;
+  using index_type = typename LEFT_OPERAND_TYPE::index_type;
+
+  using result_type = typename multiply_op::result_type;
+  static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+  add_operand_type m_add_operand;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorMultiplyAdd(left_operand_type const &left_operand,
+                    right_operand_type const &right_operand,
+                    add_operand_type const &add_operand)
+      : m_left_operand{left_operand},
+        m_right_operand{right_operand},
+        m_add_operand{add_operand}
+  {
+  }
 
 
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const &tile) const
+      -> decltype(multiply_op::multiply_add(tile,
+                                            m_left_operand,
+                                            m_right_operand,
+                                            m_add_operand))
+  {
+    return multiply_op::multiply_add(tile,
+                                     m_left_operand,
+                                     m_right_operand,
+                                     m_add_operand);
+  }
 
-  } // namespace ET
 
-  } // namespace internal
-} // namespace expt
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("MultiplyAdd[");
+    multiply_op::print_ast();
+    printf("](");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(", ");
+    m_add_operand.print_ast();
+    printf(")");
+  }
+};
+
+
+}  // namespace ET
+
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp
index d5211e4963..703876e3e8 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp
@@ -19,10 +19,8 @@
 #define RAJA_pattern_tensor_ET_TensorNegate_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -33,61 +31,58 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+template <typename ET_TYPE>
+class TensorNegate : public TensorExpressionBase<TensorNegate<ET_TYPE>>
+{
+public:
+  using self_type = TensorNegate<ET_TYPE>;
+  using rhs_type = ET_TYPE;
+  using tensor_type = typename ET_TYPE::result_type;
+  using element_type = typename tensor_type::element_type;
+  using index_type = typename ET_TYPE::index_type;
+
+  using result_type = tensor_type;
+  using tile_type = typename ET_TYPE::tile_type;
+  static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorNegate(rhs_type const &tensor) : m_tensor{tensor} {}
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return m_tensor.getDimSize(dim);
+  }
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const &tile) const
+  {
+    return m_tensor.eval(tile).scale(-1);
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
   {
+    printf("Negate(");
+    m_tensor.print_ast();
+    printf(")");
+  }
+
+private:
+  rhs_type m_tensor;
+};
+
+
+}  // namespace ET
 
-    template<typename ET_TYPE>
-    class TensorNegate :  public TensorExpressionBase<TensorNegate<ET_TYPE>> {
-      public:
-        using self_type = TensorNegate<ET_TYPE>;
-        using rhs_type = ET_TYPE;
-        using tensor_type = typename ET_TYPE::result_type;
-        using element_type = typename tensor_type::element_type;
-        using index_type = typename ET_TYPE::index_type;
-
-        using result_type = tensor_type;
-        using tile_type = typename ET_TYPE::tile_type;
-        static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorNegate(rhs_type const &tensor) :
-        m_tensor{tensor}
-        {}
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_tensor.getDimSize(dim);
-        }
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const
-        {
-          return m_tensor.eval(tile).scale(-1);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Negate(");
-          m_tensor.print_ast();
-          printf(")");
-        }
-
-      private:
-        rhs_type m_tensor;
-    };
-
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp
index 4ab0a3ebc6..123d3bf020 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp
@@ -19,10 +19,8 @@
 #define RAJA_pattern_tensor_ET_ScalarLiteral_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -33,78 +31,71 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+
+template <typename T>
+class TensorScalarLiteral : public TensorExpressionBase<TensorScalarLiteral<T>>
+{
+public:
+  using self_type = TensorScalarLiteral<T>;
+  using tensor_type = RAJA::expt::ScalarRegister<T>;
+  using element_type = T;
+  using result_type = T;
+  using index_type = RAJA::Index_type;
+
+  static constexpr camp::idx_t s_num_dims = 0;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type) const { return 0; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  explicit constexpr TensorScalarLiteral(element_type const &value) noexcept
+      : m_value{value}
+  {
+  }
+
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE element_type eval(TILE_TYPE const &) const
   {
+    return m_value;
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("ScalarLiteral(%e)", (double)m_value); }
+
+private:
+  element_type m_value;
+};
+
+
+/*
+ * For arithmetic values, we need to wrap in a constant value ET node
+ */
+template <typename RHS>
+struct NormalizeOperandHelper<
+    RHS,
+    typename std::enable_if<std::is_arithmetic<RHS>::value>::type> {
+  using return_type = TensorScalarLiteral<RHS>;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr return_type normalize(RHS const &rhs)
+  {
+    return return_type(rhs);
+  }
+};
+
 
+}  // namespace ET
 
-    template<typename T>
-    class TensorScalarLiteral :  public TensorExpressionBase<TensorScalarLiteral<T>> {
-      public:
-        using self_type = TensorScalarLiteral<T>;
-        using tensor_type = RAJA::expt::ScalarRegister<T>;
-        using element_type = T;
-        using result_type = T;
-        using index_type = RAJA::Index_type;
-
-        static constexpr camp::idx_t s_num_dims = 0;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type ) const {
-          return 0;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        explicit
-        constexpr
-        TensorScalarLiteral(element_type const &value) noexcept :
-        m_value{value}
-        {}
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        element_type eval(TILE_TYPE const &) const {
-          return m_value;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("ScalarLiteral(%e)", (double)m_value);
-        }
-
-      private:
-        element_type m_value;
-    };
-
-
-    /*
-     * For arithmetic values, we need to wrap in a constant value ET node
-     */
-    template<typename RHS>
-    struct NormalizeOperandHelper<RHS,
-    typename std::enable_if<std::is_arithmetic<RHS>::value>::type>
-    {
-        using return_type = TensorScalarLiteral<RHS>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        return_type normalize(RHS const &rhs){
-          return return_type(rhs);
-        }
-    };
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp
index 46950eec6f..24042d65c3 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp
@@ -19,10 +19,8 @@
 #define RAJA_pattern_tensor_ET_TensorTranspose_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -33,67 +31,63 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+template <typename ET_TYPE>
+class TensorTranspose : public TensorExpressionBase<TensorTranspose<ET_TYPE>>
+{
+public:
+  using self_type = TensorTranspose<ET_TYPE>;
+  using rhs_type = ET_TYPE;
+  using tensor_type = typename ET_TYPE::result_type;
+  using element_type = typename tensor_type::element_type;
+  using index_type = typename ET_TYPE::index_type;
+
+  using result_type = tensor_type;
+  using tile_type = typename ET_TYPE::tile_type;
+  static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorTranspose(rhs_type const &tensor) : m_tensor{tensor} {}
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
   {
+    return m_tensor.getDimSize(dim);
+  }
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const &tile) const
+  {
+    // transpose which tile we are returning
+    TILE_TYPE trans_tile{{tile.m_begin[1], tile.m_begin[0]},
+                         {tile.m_size[1], tile.m_size[0]}};
+
+    // evaluate and return the transposed tile
+    return m_tensor.eval(trans_tile).transpose();
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Transpose(");
+    m_tensor.print_ast();
+    printf(")");
+  }
+
+private:
+  rhs_type m_tensor;
+};
+
+
+}  // namespace ET
 
-    template<typename ET_TYPE>
-    class TensorTranspose :  public TensorExpressionBase<TensorTranspose<ET_TYPE>> {
-      public:
-        using self_type = TensorTranspose<ET_TYPE>;
-        using rhs_type = ET_TYPE;
-        using tensor_type = typename ET_TYPE::result_type;
-        using element_type = typename tensor_type::element_type;
-        using index_type = typename ET_TYPE::index_type;
-
-        using result_type = tensor_type;
-        using tile_type = typename ET_TYPE::tile_type;
-        static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorTranspose(rhs_type const &tensor) :
-        m_tensor{tensor}
-        {}
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_tensor.getDimSize(dim);
-        }
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const {
-          // transpose which tile we are returning
-          TILE_TYPE trans_tile{
-            {tile.m_begin[1], tile.m_begin[0]},
-            {tile.m_size[1],  tile.m_size[0]}
-          };
-
-          // evaluate and return the transposed tile
-          return m_tensor.eval(trans_tile).transpose();
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Transpose(");
-          m_tensor.print_ast();
-          printf(")");
-        }
-
-      private:
-        rhs_type m_tensor;
-    };
-
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp b/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp
index 2a868a3131..7acc26177f 100644
--- a/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp
@@ -19,10 +19,8 @@
 #define RAJA_pattern_tensor_ET_normalizeOperand_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/TensorRef.hpp"
+#include "RAJA/util/macros.hpp"
 
 
 namespace RAJA
@@ -33,64 +31,56 @@ namespace expt
 {
 
 
-    class TensorRegisterConcreteBase;
-
-  namespace ET
-  {
-    class TensorExpressionConcreteBase;
-
-    template<typename RHS, typename enable = void>
-    struct NormalizeOperandHelper;
+class TensorRegisterConcreteBase;
 
+namespace ET
+{
+class TensorExpressionConcreteBase;
 
-    /*
-     * For TensorExpression nodes, we just return them as-is.
-     */
-    template<typename RHS>
-    struct NormalizeOperandHelper<RHS,
-    typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RHS>::value>::type>
-    {
-        using return_type = RHS;
+template <typename RHS, typename enable = void>
+struct NormalizeOperandHelper;
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        return_type normalize(RHS const &rhs){
-          return rhs;
-        }
-    };
 
+/*
+ * For TensorExpression nodes, we just return them as-is.
+ */
+template <typename RHS>
+struct NormalizeOperandHelper<
+    RHS,
+    typename std::enable_if<
+        std::is_base_of<TensorExpressionConcreteBase, RHS>::value>::type> {
+  using return_type = RHS;
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr return_type normalize(RHS const &rhs) { return rhs; }
+};
 
 
-    /**
-     * Allows uniform packaging up of operands into ExpressionTemplates.
-     *
-     * The NormalizeOperandHelper is specialized throughout the code in order
-     * to convert non-ET operands into ET objects
-     *
-     * ET operators can then take any operand type, and use this to convert
-     * them into ET types the same way.
-     */
-    template<typename RHS>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto normalizeOperand(RHS const &rhs) ->
+/**
+ * Allows uniform packaging up of operands into ExpressionTemplates.
+ *
+ * The NormalizeOperandHelper is specialized throughout the code in order
+ * to convert non-ET operands into ET objects
+ *
+ * ET operators can then take any operand type, and use this to convert
+ * them into ET types the same way.
+ */
+template <typename RHS>
+RAJA_INLINE RAJA_HOST_DEVICE auto normalizeOperand(RHS const &rhs) ->
     typename NormalizeOperandHelper<RHS>::return_type
-    {
-      return NormalizeOperandHelper<RHS>::normalize(rhs);
-    }
+{
+  return NormalizeOperandHelper<RHS>::normalize(rhs);
+}
 
-    template<typename RHS>
-    using normalize_operand_t =
-        typename NormalizeOperandHelper<RHS>::return_type;
+template <typename RHS>
+using normalize_operand_t = typename NormalizeOperandHelper<RHS>::return_type;
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp b/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp
index 2b6bf7304d..aab51e996b 100644
--- a/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp
+++ b/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp
@@ -18,9 +18,9 @@
 #ifndef RAJA_pattern_tensor_expression_template_HPP
 #define RAJA_pattern_tensor_expression_template_HPP
 
-#include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp"
 #include "RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp"
+#include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/ET/TensorDivide.hpp"
 #include "RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp"
 #include "RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp"
@@ -31,5 +31,4 @@
 #include "RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp"
 
 
-
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp b/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp
index 08a9886acc..e0f8be621b 100644
--- a/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp
+++ b/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp
@@ -18,9 +18,9 @@
 #ifndef RAJA_pattern_tensor_internal_MatrixMatrixMultiply_HPP
 #define RAJA_pattern_tensor_internal_MatrixMatrixMultiply_HPP
 
-#include "camp/camp.hpp"
 #include "RAJA/config.hpp"
 #include "RAJA/pattern/tensor/MatrixRegister.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -31,308 +31,315 @@ namespace expt
 {
 
 
+template <typename MATA, typename MATB>
+struct MatrixMatrixMultiplyHelper;
 
 
-
-
-
-  template<typename MATA, typename MATB>
-  struct MatrixMatrixMultiplyHelper;
-
-
-
-  /**
-   *
-   * Row-Major * Row-Major ==> Row-Major
+/**
+ *
+ * Row-Major * Row-Major ==> Row-Major
+ *
+ */
+template <typename T,
+          typename REGISTER_POLICY,
+          camp::idx_t N_SIZE,
+          camp::idx_t M_SIZE,
+          camp::idx_t M2_SIZE,
+          camp::idx_t O_SIZE>
+struct MatrixMatrixMultiplyHelper<
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::RowMajorLayout,
+                               camp::idx_seq<N_SIZE, M_SIZE>>,
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::RowMajorLayout,
+                               camp::idx_seq<M2_SIZE, O_SIZE>>> {
+
+  static_assert(M_SIZE == M2_SIZE,
+                "Matrices are not compatible for multiplication");
+
+  using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                               T,
+                                               RAJA::expt::RowMajorLayout,
+                                               camp::idx_seq<N_SIZE, M_SIZE>>;
+
+  using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                T,
+                                                RAJA::expt::RowMajorLayout,
+                                                camp::idx_seq<M_SIZE, O_SIZE>>;
+
+  using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                 T,
+                                                 RAJA::expt::RowMajorLayout,
+                                                 camp::idx_seq<N_SIZE, O_SIZE>>;
+
+  using register_type = typename result_type::register_type;
+
+  static constexpr camp::idx_t s_elements_per_register =
+      left_type::s_elements_per_register;
+  static constexpr camp::idx_t s_A_minor_dim_registers =
+      left_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_B_minor_dim_registers =
+      right_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_C_minor_dim_registers =
+      result_type::s_minor_dim_registers;
+
+  /*
+   * Matrix B (and C) has 1 or more registers per row
    *
    */
-  template<typename T, typename REGISTER_POLICY, camp::idx_t N_SIZE, camp::idx_t M_SIZE, camp::idx_t M2_SIZE, camp::idx_t O_SIZE>
-  struct MatrixMatrixMultiplyHelper<
-  RAJA::expt::TensorRegister<REGISTER_POLICY,
-                   T,
-                   RAJA::expt::RowMajorLayout,
-                   camp::idx_seq<N_SIZE, M_SIZE>>,
-                   RAJA::expt::TensorRegister<REGISTER_POLICY,
-                    T,
-                    RAJA::expt::RowMajorLayout,
-                    camp::idx_seq<M2_SIZE, O_SIZE>> >
-    {
-
-      static_assert(M_SIZE == M2_SIZE, "Matrices are not compatible for multiplication");
-
-      using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                       T,
-                                       RAJA::expt::RowMajorLayout,
-                                       camp::idx_seq<N_SIZE, M_SIZE>>;
-
-      using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                        T,
-                                        RAJA::expt::RowMajorLayout,
-                                        camp::idx_seq<M_SIZE, O_SIZE>> ;
-
-      using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                         T,
-                                         RAJA::expt::RowMajorLayout,
-                                         camp::idx_seq<N_SIZE, O_SIZE>> ;
-
-      using register_type = typename result_type::register_type;
-
-      static constexpr camp::idx_t s_elements_per_register = left_type::s_elements_per_register;
-      static constexpr camp::idx_t s_A_minor_dim_registers = left_type::s_minor_dim_registers;
-      static constexpr camp::idx_t s_B_minor_dim_registers = right_type::s_minor_dim_registers;
-      static constexpr camp::idx_t s_C_minor_dim_registers = result_type::s_minor_dim_registers;
-
-      /*
-       * Matrix B (and C) has 1 more more registers per row
-       *
-       */
-      template<typename dummy = void>
-      RAJA_HOST_DEVICE
-      static
-      RAJA_INLINE
+  template <typename dummy = void>
+  RAJA_HOST_DEVICE static RAJA_INLINE
       typename std::enable_if<(s_C_minor_dim_registers != 0), dummy>::type
-      multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
-      {
+      multiply_accumulate(left_type const &A,
+                          right_type const &B,
+                          result_type &C)
+  {
 #if defined(RAJA_ENABLE_VECTOR_STATS) && !defined(__CUDA_ARCH__)
-        RAJA::tensor_stats::num_matrix_mm_multacc_row_row ++;
+    RAJA::tensor_stats::num_matrix_mm_multacc_row_row++;
 #endif
 
-        constexpr camp::idx_t num_bc_reg_per_row = s_C_minor_dim_registers;
+    constexpr camp::idx_t num_bc_reg_per_row = s_C_minor_dim_registers;
 
-        RAJA_UNROLL
-        for(camp::idx_t c_reg = 0;c_reg < result_type::s_num_registers;++ c_reg){
-          camp::idx_t bc_col_reg = c_reg % num_bc_reg_per_row;
-          camp::idx_t ac_row = c_reg / num_bc_reg_per_row;
-
-          RAJA_UNROLL
-          for(camp::idx_t a_col = 0;a_col < M_SIZE;++ a_col){
-            camp::idx_t b_reg = a_col * num_bc_reg_per_row + bc_col_reg;
-
-            C.get_register(c_reg) =
-                register_type(A.get(ac_row, a_col)).multiply_add(
-                    B.get_register(b_reg),
-                    C.get_register(c_reg));
-          }
-        }
+    RAJA_UNROLL
+    for (camp::idx_t c_reg = 0; c_reg < result_type::s_num_registers; ++c_reg) {
+      camp::idx_t bc_col_reg = c_reg % num_bc_reg_per_row;
+      camp::idx_t ac_row = c_reg / num_bc_reg_per_row;
+
+      RAJA_UNROLL
+      for (camp::idx_t a_col = 0; a_col < M_SIZE; ++a_col) {
+        camp::idx_t b_reg = a_col * num_bc_reg_per_row + bc_col_reg;
 
+        C.get_register(c_reg) =
+            register_type(A.get(ac_row, a_col))
+                .multiply_add(B.get_register(b_reg), C.get_register(c_reg));
       }
+    }
+  }
 
-      /*
-       * Matrix B (and C) have less than one register per row
-       *
-       */
-      template<typename dummy = void>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
+  /*
+   * Matrix B (and C) have less than one register per row
+   *
+   */
+  template <typename dummy = void>
+  RAJA_HOST_DEVICE RAJA_INLINE static
       typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
-      multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
-      {
-        constexpr camp::idx_t bc_segbits = result_type::s_segbits;
-        constexpr camp::idx_t a_segments_per_register = 1<<bc_segbits;
-
-        RAJA_UNROLL
-        for(camp::idx_t ac_row = 0;ac_row < N_SIZE;++ ac_row){
-          camp::idx_t c_reg     = ac_row / result_type::s_major_dim_per_register;
-          camp::idx_t c_segment = ac_row % result_type::s_major_dim_per_register;
-          register_type c_tmp;
-
-          RAJA_UNROLL
-          for(camp::idx_t b_reg = 0;b_reg < right_type::s_num_registers;++ b_reg){
-
-            camp::idx_t a_segment = ac_row*right_type::s_num_registers + b_reg;
-            camp::idx_t a_reg = a_segment / a_segments_per_register;
-            camp::idx_t a_reg_segment = a_segment % a_segments_per_register;
-
-            auto a_tmp = A.get_register(a_reg).segmented_broadcast_outer(bc_segbits, a_reg_segment);
-
-            if(b_reg == 0){
-
-              c_tmp = a_tmp.multiply(B.get_register(b_reg));
-            }
-            else{
-              c_tmp = a_tmp.multiply_add(B.get_register(b_reg), c_tmp);
-            }
-
-          }
-
-          C.get_register(c_reg) += c_tmp.segmented_sum_outer(bc_segbits, c_segment);
-
+      multiply_accumulate(left_type const &A,
+                          right_type const &B,
+                          result_type &C)
+  {
+    constexpr camp::idx_t bc_segbits = result_type::s_segbits;
+    constexpr camp::idx_t a_segments_per_register = 1 << bc_segbits;
+
+    RAJA_UNROLL
+    for (camp::idx_t ac_row = 0; ac_row < N_SIZE; ++ac_row) {
+      camp::idx_t c_reg = ac_row / result_type::s_major_dim_per_register;
+      camp::idx_t c_segment = ac_row % result_type::s_major_dim_per_register;
+      register_type c_tmp;
+
+      RAJA_UNROLL
+      for (camp::idx_t b_reg = 0; b_reg < right_type::s_num_registers;
+           ++b_reg) {
+
+        camp::idx_t a_segment = ac_row * right_type::s_num_registers + b_reg;
+        camp::idx_t a_reg = a_segment / a_segments_per_register;
+        camp::idx_t a_reg_segment = a_segment % a_segments_per_register;
+
+        auto a_tmp =
+            A.get_register(a_reg).segmented_broadcast_outer(bc_segbits,
+                                                            a_reg_segment);
+
+        if (b_reg == 0) {
+
+          c_tmp = a_tmp.multiply(B.get_register(b_reg));
+        } else {
+          c_tmp = a_tmp.multiply_add(B.get_register(b_reg), c_tmp);
         }
-
       }
 
-      RAJA_HOST_DEVICE
-      static
-      RAJA_INLINE
-      void multiply(left_type const &A, right_type const &B, result_type &C){
-        C = result_type(0);
-        multiply_accumulate(A, B, C);
-      }
-  };
+      C.get_register(c_reg) += c_tmp.segmented_sum_outer(bc_segbits, c_segment);
+    }
+  }
 
+  RAJA_HOST_DEVICE
+  static RAJA_INLINE void multiply(left_type const &A,
+                                   right_type const &B,
+                                   result_type &C)
+  {
+    C = result_type(0);
+    multiply_accumulate(A, B, C);
+  }
+};
 
-  /**
-   *
-   * Column-Major * Column-Major ==> Column-Major
+
+/**
+ *
+ * Column-Major * Column-Major ==> Column-Major
+ *
+ */
+template <typename T,
+          typename REGISTER_POLICY,
+          camp::idx_t N_SIZE,
+          camp::idx_t M_SIZE,
+          camp::idx_t M2_SIZE,
+          camp::idx_t O_SIZE>
+struct MatrixMatrixMultiplyHelper<
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::ColMajorLayout,
+                               camp::idx_seq<N_SIZE, M_SIZE>>,
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::ColMajorLayout,
+                               camp::idx_seq<M2_SIZE, O_SIZE>>> {
+
+  using self_type = MatrixMatrixMultiplyHelper<
+      RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                 T,
+                                 RAJA::expt::ColMajorLayout,
+                                 camp::idx_seq<N_SIZE, M_SIZE>>,
+      RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                 T,
+                                 RAJA::expt::ColMajorLayout,
+                                 camp::idx_seq<M2_SIZE, O_SIZE>>>;
+
+  static_assert(M_SIZE == M2_SIZE,
+                "Matrices are not compatible for multiplication");
+
+  using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                               T,
+                                               RAJA::expt::ColMajorLayout,
+                                               camp::idx_seq<N_SIZE, M_SIZE>>;
+
+  using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                T,
+                                                RAJA::expt::ColMajorLayout,
+                                                camp::idx_seq<M_SIZE, O_SIZE>>;
+
+  using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                 T,
+                                                 RAJA::expt::ColMajorLayout,
+                                                 camp::idx_seq<N_SIZE, O_SIZE>>;
+
+  using register_type = typename result_type::register_type;
+
+  static constexpr camp::idx_t s_elements_per_register =
+      left_type::s_elements_per_register;
+  static constexpr camp::idx_t s_A_minor_dim_registers =
+      left_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_B_minor_dim_registers =
+      right_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_C_minor_dim_registers =
+      result_type::s_minor_dim_registers;
+
+
+  /*
+   * Matrix A (and C) has 1 or more registers per column
    *
    */
-  template<typename T, typename REGISTER_POLICY, camp::idx_t N_SIZE, camp::idx_t M_SIZE, camp::idx_t M2_SIZE, camp::idx_t O_SIZE>
-    struct MatrixMatrixMultiplyHelper<
-    RAJA::expt::TensorRegister<REGISTER_POLICY,
-                     T,
-                     RAJA::expt::ColMajorLayout,
-                     camp::idx_seq<N_SIZE, M_SIZE>>,
-                     RAJA::expt::TensorRegister<REGISTER_POLICY,
-                      T,
-                      RAJA::expt::ColMajorLayout,
-                      camp::idx_seq<M2_SIZE, O_SIZE>> >
-      {
-
-      using self_type = MatrixMatrixMultiplyHelper<
-          RAJA::expt::TensorRegister<REGISTER_POLICY,
-                         T,
-                         RAJA::expt::ColMajorLayout,
-                         camp::idx_seq<N_SIZE, M_SIZE>>,
-                         RAJA::expt::TensorRegister<REGISTER_POLICY,
-                          T,
-                          RAJA::expt::ColMajorLayout,
-                          camp::idx_seq<M2_SIZE, O_SIZE>> >;
-
-        static_assert(M_SIZE == M2_SIZE, "Matrices are not compatible for multiplication");
-
-        using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                         T,
-                                         RAJA::expt::ColMajorLayout,
-                                         camp::idx_seq<N_SIZE, M_SIZE>>;
-
-        using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                          T,
-                                          RAJA::expt::ColMajorLayout,
-                                          camp::idx_seq<M_SIZE, O_SIZE>> ;
-
-        using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                           T,
-                                           RAJA::expt::ColMajorLayout,
-                                           camp::idx_seq<N_SIZE, O_SIZE>> ;
-
-        using register_type = typename result_type::register_type;
-
-        static constexpr camp::idx_t s_elements_per_register = left_type::s_elements_per_register;
-        static constexpr camp::idx_t s_A_minor_dim_registers = left_type::s_minor_dim_registers;
-        static constexpr camp::idx_t s_B_minor_dim_registers = right_type::s_minor_dim_registers;
-        static constexpr camp::idx_t s_C_minor_dim_registers = result_type::s_minor_dim_registers;
-
-
-
-        /*
-         * Matrix A (and C) has 1 more more registers per column
-         *
-         */
-        template<typename dummy = void>
-        RAJA_HOST_DEVICE
-        static
-        RAJA_INLINE
-        typename std::enable_if<(s_C_minor_dim_registers != 0), dummy>::type
-        multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
-        {
-
-  #if defined(RAJA_ENABLE_VECTOR_STATS) && !defined(__CUDA_ARCH__)
-          RAJA::tensor_stats::num_matrix_mm_multacc_row_row ++;
-  #endif
-
-
-          constexpr camp::idx_t num_ac_reg_per_col = s_C_minor_dim_registers;
-
-          RAJA_UNROLL
-          for(camp::idx_t c_reg = 0;c_reg < result_type::s_num_registers;++ c_reg){
-            camp::idx_t ac_row_reg = c_reg % num_ac_reg_per_col;
-            camp::idx_t bc_col = c_reg / num_ac_reg_per_col;
-
-            RAJA_UNROLL
-            for(camp::idx_t b_row = 0;b_row < M_SIZE;++ b_row){
-              camp::idx_t a_reg = b_row * num_ac_reg_per_col + ac_row_reg;
-
-              C.get_register(c_reg) =
-                  register_type(B.get(b_row, bc_col)).multiply_add(
-                      A.get_register(a_reg),
-                      C.get_register(c_reg));
-            }
-          }
-
-
-        }
-
-        /*
-         * Matrix A (and C) have less than one register per column
-         *
-         */
-        template<typename dummy = void>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        static
-        typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
-        multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
-        {
-          constexpr camp::idx_t ac_segbits = result_type::s_segbits;
-          constexpr camp::idx_t b_segments_per_register = 1<<ac_segbits;
+  template <typename dummy = void>
+  RAJA_HOST_DEVICE static RAJA_INLINE
+      typename std::enable_if<(s_C_minor_dim_registers != 0), dummy>::type
+      multiply_accumulate(left_type const &A,
+                          right_type const &B,
+                          result_type &C)
+  {
 
-          camp::idx_t bc_col = 0;
+#if defined(RAJA_ENABLE_VECTOR_STATS) && !defined(__CUDA_ARCH__)
+    RAJA::tensor_stats::num_matrix_mm_multacc_row_row++;
+#endif
 
-          RAJA_UNROLL
-          for(camp::idx_t c_reg = 0;c_reg < N_SIZE/result_type::s_major_dim_per_register;++ c_reg){
 
-            RAJA_UNROLL
-            for(camp::idx_t c_segment = 0;c_segment < result_type::s_major_dim_per_register;++ c_segment){
+    constexpr camp::idx_t num_ac_reg_per_col = s_C_minor_dim_registers;
 
-              register_type c_tmp;
+    RAJA_UNROLL
+    for (camp::idx_t c_reg = 0; c_reg < result_type::s_num_registers; ++c_reg) {
+      camp::idx_t ac_row_reg = c_reg % num_ac_reg_per_col;
+      camp::idx_t bc_col = c_reg / num_ac_reg_per_col;
 
-              RAJA_UNROLL
-              for(camp::idx_t a_reg = 0;a_reg < right_type::s_num_registers;++ a_reg){
+      RAJA_UNROLL
+      for (camp::idx_t b_row = 0; b_row < M_SIZE; ++b_row) {
+        camp::idx_t a_reg = b_row * num_ac_reg_per_col + ac_row_reg;
 
+        C.get_register(c_reg) =
+            register_type(B.get(b_row, bc_col))
+                .multiply_add(A.get_register(a_reg), C.get_register(c_reg));
+      }
+    }
+  }
 
-                camp::idx_t b_segment = bc_col*right_type::s_num_registers + a_reg;
-                camp::idx_t b_reg = b_segment / b_segments_per_register;
-                camp::idx_t b_reg_segment = b_segment % b_segments_per_register;
+  /*
+   * Matrix A (and C) have less than one register per column
+   *
+   */
+  template <typename dummy = void>
+  RAJA_HOST_DEVICE RAJA_INLINE static
+      typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
+      multiply_accumulate(left_type const &A,
+                          right_type const &B,
+                          result_type &C)
+  {
+    constexpr camp::idx_t ac_segbits = result_type::s_segbits;
+    constexpr camp::idx_t b_segments_per_register = 1 << ac_segbits;
 
-                register_type b_tmp = B.get_register(b_reg).segmented_broadcast_outer(ac_segbits, b_reg_segment);
+    camp::idx_t bc_col = 0;
 
-                if(a_reg == 0){
-                  c_tmp = b_tmp.multiply(A.get_register(a_reg));
-                }
-                else{
-                  c_tmp = b_tmp.multiply_add(A.get_register(a_reg), c_tmp);
-                }
+    RAJA_UNROLL
+    for (camp::idx_t c_reg = 0;
+         c_reg < N_SIZE / result_type::s_major_dim_per_register;
+         ++c_reg) {
 
-              }
+      RAJA_UNROLL
+      for (camp::idx_t c_segment = 0;
+           c_segment < result_type::s_major_dim_per_register;
+           ++c_segment) {
 
-              C.get_register(c_reg) += c_tmp.segmented_sum_outer(ac_segbits, c_segment);
+        register_type c_tmp;
 
-              ++ bc_col;
-            } // c_segment
-          } // c_reg
+        RAJA_UNROLL
+        for (camp::idx_t a_reg = 0; a_reg < right_type::s_num_registers;
+             ++a_reg) {
 
 
-        }
+          camp::idx_t b_segment = bc_col * right_type::s_num_registers + a_reg;
+          camp::idx_t b_reg = b_segment / b_segments_per_register;
+          camp::idx_t b_reg_segment = b_segment % b_segments_per_register;
 
+          register_type b_tmp =
+              B.get_register(b_reg).segmented_broadcast_outer(ac_segbits,
+                                                              b_reg_segment);
 
-        RAJA_HOST_DEVICE
-        static
-        RAJA_INLINE
-        void multiply(left_type const &A, right_type const &B, result_type &C){
-          C = result_type(0);
-          self_type::multiply_accumulate(A, B, C);
+          if (a_reg == 0) {
+            c_tmp = b_tmp.multiply(A.get_register(a_reg));
+          } else {
+            c_tmp = b_tmp.multiply_add(A.get_register(a_reg), c_tmp);
+          }
         }
-    };
 
+        C.get_register(c_reg) +=
+            c_tmp.segmented_sum_outer(ac_segbits, c_segment);
 
+        ++bc_col;
+      }  // c_segment
+    }    // c_reg
+  }
 
 
-} // namespace expt
-} // namespace internal
-} // namespace RAJA
+  RAJA_HOST_DEVICE
+  static RAJA_INLINE void multiply(left_type const &A,
+                                   right_type const &B,
+                                   result_type &C)
+  {
+    C = result_type(0);
+    self_type::multiply_accumulate(A, B, C);
+  }
+};
 
 
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp b/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp
index 3036a096b5..125ea5664c 100644
--- a/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp
+++ b/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp
@@ -18,11 +18,11 @@
 #ifndef RAJA_pattern_tensor_MatrixRegisterImpl_HPP
 #define RAJA_pattern_tensor_MatrixRegisterImpl_HPP
 
-#include "camp/camp.hpp"
 #include "RAJA/config.hpp"
 #include "RAJA/pattern/tensor/MatrixRegister.hpp"
 #include "RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp"
 #include "RAJA/util/BitMask.hpp"
+#include "camp/camp.hpp"
 
 //#define DEBUG_MATRIX_LOAD_STORE
 
@@ -32,1121 +32,1245 @@ namespace RAJA
 namespace expt
 {
 
-  /*
-   * 2D (Matrix) specialization of TensorRegister
-   */
-  template<typename REGISTER_POLICY, typename T, camp::idx_t ROW_ORD, camp::idx_t COL_ORD, camp::idx_t ROW_SIZE, camp::idx_t COL_SIZE>
-  class TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>> :
-    public RAJA::internal::expt::TensorRegisterBase<TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>>
+/*
+ * 2D (Matrix) specialization of TensorRegister
+ */
+template <typename REGISTER_POLICY,
+          typename T,
+          camp::idx_t ROW_ORD,
+          camp::idx_t COL_ORD,
+          camp::idx_t ROW_SIZE,
+          camp::idx_t COL_SIZE>
+class TensorRegister<REGISTER_POLICY,
+                     T,
+                     TensorLayout<ROW_ORD, COL_ORD>,
+                     camp::idx_seq<ROW_SIZE, COL_SIZE>>
+    : public RAJA::internal::expt::TensorRegisterBase<
+          TensorRegister<REGISTER_POLICY,
+                         T,
+                         TensorLayout<ROW_ORD, COL_ORD>,
+                         camp::idx_seq<ROW_SIZE, COL_SIZE>>>
+{
+public:
+  using self_type = TensorRegister<REGISTER_POLICY,
+                                   T,
+                                   TensorLayout<ROW_ORD, COL_ORD>,
+                                   camp::idx_seq<ROW_SIZE, COL_SIZE>>;
+  using base_type = RAJA::internal::expt::TensorRegisterBase<
+      TensorRegister<REGISTER_POLICY,
+                     T,
+                     TensorLayout<ROW_ORD, COL_ORD>,
+                     camp::idx_seq<ROW_SIZE, COL_SIZE>>>;
+  using register_type = Register<T, REGISTER_POLICY>;
+  using row_vector_type = VectorRegister<T, REGISTER_POLICY, COL_SIZE>;
+  using column_vector_type = VectorRegister<T, REGISTER_POLICY, ROW_SIZE>;
+  using register_policy = REGISTER_POLICY;
+  using element_type = T;
+  using layout_type = TensorLayout<ROW_ORD, COL_ORD>;
+
+  using transpose_tensor_type =
+      TensorRegister<REGISTER_POLICY,
+                     T,
+                     TensorLayout<!ROW_ORD, !COL_ORD>,
+                     camp::idx_seq<ROW_SIZE, COL_SIZE>>;
+
+  using transpose_type = TensorRegister<REGISTER_POLICY,
+                                        T,
+                                        layout_type,
+                                        camp::idx_seq<COL_SIZE, ROW_SIZE>>;
+  using product_type = TensorRegister<REGISTER_POLICY,
+                                      T,
+                                      layout_type,
+                                      camp::idx_seq<ROW_SIZE, ROW_SIZE>>;
+
+  static constexpr camp::idx_t s_num_rows = ROW_SIZE;
+  static constexpr camp::idx_t s_num_columns = COL_SIZE;
+
+
+  static constexpr camp::idx_t s_elements_per_register =
+      RAJA::internal::expt::RegisterTraits<REGISTER_POLICY, T>::s_num_elem;
+
+  // number of registers to hold entire matrix
+  static constexpr camp::idx_t s_num_registers =
+      (ROW_SIZE * COL_SIZE) / s_elements_per_register;
+
+  // We only allow matrix sizes that exactly fit in some number of registers
+  static_assert((ROW_SIZE * COL_SIZE) ==
+                    s_num_registers * s_elements_per_register,
+                "MatrixRegister must be dimensioned to exactly fit an integer "
+                "number of registers");
+
+  using log_base2_t = RAJA::LogBase2<s_elements_per_register>;
+
+  static constexpr camp::idx_t s_shift_per_register = log_base2_t::value;
+
+  static constexpr camp::idx_t s_mask_per_register =
+      (1 << log_base2_t::value) - 1;
+
+
+  static constexpr camp::idx_t s_minor_dim_elements =
+      layout_type::is_row_major() ? s_num_columns : s_num_rows;
+
+  static constexpr camp::idx_t s_major_dim_elements =
+      layout_type::is_row_major() ? s_num_rows : s_num_columns;
+
+  // number of (full) registers that span the minor dim
+  // if a single register is split across multiple rows or columns, then
+  // this is 0
+  static constexpr camp::idx_t s_minor_dim_registers =
+      s_minor_dim_elements / s_elements_per_register;
+
+  static_assert(s_minor_dim_registers > 0 || log_base2_t::is_exact,
+                "Minor dimension smaller than a vector need to be a power of "
+                "two fraction");
+
+  static_assert(s_minor_dim_registers == 0 ||
+                    (s_minor_dim_elements % s_elements_per_register == 0),
+                "Minor dimensions greater than a vector length must be an "
+                "integer number of vectors");
+
+
+  static constexpr camp::idx_t s_major_dim_per_register =
+      s_elements_per_register / s_minor_dim_elements;
+
+  static constexpr camp::idx_t s_segbits =
+      RAJA::LogBase2<s_minor_dim_elements>::value;
+
+private:
+  template <typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_register(IDX row,
+                                                                 IDX col) -> IDX
   {
-    public:
-      using self_type = TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>;
-      using base_type = RAJA::internal::expt::TensorRegisterBase<TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>>;
-      using register_type = Register<T, REGISTER_POLICY>;
-      using row_vector_type = VectorRegister<T, REGISTER_POLICY, COL_SIZE>;
-      using column_vector_type = VectorRegister<T, REGISTER_POLICY, ROW_SIZE>;
-      using register_policy = REGISTER_POLICY;
-      using element_type = T;
-      using layout_type = TensorLayout<ROW_ORD, COL_ORD>;
-
-      using transpose_tensor_type = TensorRegister<REGISTER_POLICY, T, TensorLayout<!ROW_ORD, !COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>;
-
-      using transpose_type = TensorRegister<REGISTER_POLICY, T, layout_type, camp::idx_seq<COL_SIZE, ROW_SIZE>>;
-      using product_type = TensorRegister<REGISTER_POLICY, T, layout_type, camp::idx_seq<ROW_SIZE, ROW_SIZE>>;
+    return layout_type::is_row_major()
+               ? (row * IDX(COL_SIZE) + col) >> IDX(s_shift_per_register)
+               : (col * IDX(ROW_SIZE) + row) >> IDX(s_shift_per_register);
+  }
+
+  template <typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_lane(IDX row, IDX col)
+      -> IDX
+  {
+    return layout_type::is_row_major()
+               ? (row * IDX(COL_SIZE) + col) & IDX(s_mask_per_register)
+               : (col * IDX(ROW_SIZE) + row) & IDX(s_mask_per_register);
+  }
 
-      static constexpr camp::idx_t s_num_rows = ROW_SIZE;
-      static constexpr camp::idx_t s_num_columns = COL_SIZE;
+  using base_type::m_registers;
 
+public:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr TensorRegister() : base_type() {}
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  TensorRegister(element_type c) : base_type(c) { this->broadcast(c); }
 
 
-      static constexpr camp::idx_t s_elements_per_register =
-          RAJA::internal::expt::RegisterTraits<REGISTER_POLICY,T>::s_num_elem;
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorRegister(self_type const &c) : base_type(c) { this->copy(c); }
 
-      // number of registers to hold entire matrix
-      static constexpr camp::idx_t s_num_registers =
-          (ROW_SIZE*COL_SIZE) / s_elements_per_register;
 
-      // We only allow matrix sizes that exactly fit in some number of registers
-      static_assert((ROW_SIZE*COL_SIZE) == s_num_registers*s_elements_per_register,
-          "MatrixRegister must be dimensioned to exactly fit an integer number of registers");
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  ~TensorRegister() {}
 
-      using log_base2_t = RAJA::LogBase2<s_elements_per_register>;
 
-      static constexpr camp::idx_t s_shift_per_register =
-          log_base2_t::value;
+  /*!
+   * Returns true if the underlying data is packed for a given tensor ref
+   *
+   * This is true if either:
+   *   It's column major and the rows are stride one
+   *   It's row major and the columns are stride one
+   */
+  template <camp::idx_t STRIDE_ONE_DIM>
+  RAJA_HOST_DEVICE RAJA_INLINE static constexpr bool is_ref_packed()
+  {
+    return (STRIDE_ONE_DIM == 0 && layout_type::is_column_major()) ||
+           (STRIDE_ONE_DIM == 1 && layout_type::is_row_major());
+  }
 
-      static constexpr camp::idx_t s_mask_per_register =
-          (1<<log_base2_t::value)-1;
+  /*!
+   * Gets the maximum size of matrix along specified dimension
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr camp::idx_t s_dim_elem(camp::idx_t dim)
+  {
+    return dim == 0 ? ROW_SIZE : COL_SIZE;
+  }
 
 
-      static constexpr camp::idx_t s_minor_dim_elements =
-          layout_type::is_row_major() ? s_num_columns : s_num_rows;
+  /*!
+   * @brief Set entire matrix to a single scalar value
+   * @param value Value to set all matrix elements to
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator=(element_type value)
+  {
+    this->broadcast(value);
+    return *this;
+  }
 
-      static constexpr camp::idx_t s_major_dim_elements =
-          layout_type::is_row_major() ? s_num_rows : s_num_columns;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator=(self_type const &c) { return this->copy(c); }
 
-      // number of (full) registers that span the minor dim
-      // if a single register is split across multiple rows or columns, then
-      // this is 0
-      static constexpr camp::idx_t s_minor_dim_registers =
-              s_minor_dim_elements / s_elements_per_register;
 
-      static_assert(s_minor_dim_registers >0  ||  log_base2_t::is_exact,
-          "Minor dimension smaller than a vector need to be a power of two fraction");
+  /*!
+   * Provide matrix-matrix multiply for operator* between two matrices
+   */
+  template <typename T2, typename L, typename RP>
+  self_type operator*(SquareMatrixRegister<T2, L, RP> const &y) const
+  {
+    return matrix_multiply(y);
+  }
 
-      static_assert(s_minor_dim_registers == 0 || (s_minor_dim_elements % s_elements_per_register == 0),
-          "Minor dimensions greater than a vector length must be an integer number of vectors");
+  /*!
+   * Provide right matrix-vector multiply for operator* between this
+   * matrix and a vector.
+   */
+  template <typename T2, typename RP>
+  VectorRegister<T2, RP> operator*(VectorRegister<T2, RP> const &y) const
+  {
+    return right_multiply_vector(y);
+  }
 
 
-      static constexpr camp::idx_t s_major_dim_per_register =
-          s_elements_per_register / s_minor_dim_elements;
+  template <typename REF_TYPE>
+  struct RefBridge;
 
-      static constexpr camp::idx_t s_segbits = RAJA::LogBase2<s_minor_dim_elements>::value;
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &load_ref(REF_TYPE const &ref)
+  {
+    RefBridge<REF_TYPE>::load_ref(*this, ref);
+    return *this;
+  }
 
-    private:
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const &store_ref(REF_TYPE &ref) const
+  {
+    RefBridge<REF_TYPE>::store_ref(*this, ref);
+    return *this;
+  }
+
+
+  template <typename POINTER_TYPE,
+            typename INDEX_TYPE,
+            RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+            camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<
+      RAJA::internal::expt::
+          TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>> {
+
+    using RefType = RAJA::internal::expt::
+        TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>;
+
+    /*!
+     * @brief Performs load specified by TensorRef object.
+     */
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void load_ref(self_type &self, RefType const &ref)
+    {
+
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
+
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>()) {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+          self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else {
+          self.load_packed_nm(ptr,
+                              ref.m_stride[0],
+                              ref.m_stride[1],
+                              ref.m_tile.m_size[0],
+                              ref.m_tile.m_size[1]);
+        }
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_register(IDX row, IDX col) -> IDX {
-        return layout_type::is_row_major() ?
-            (row*IDX(COL_SIZE) + col) >> IDX(s_shift_per_register) :
-            (col*IDX(ROW_SIZE) + row) >> IDX(s_shift_per_register);
       }
-
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_lane(IDX row, IDX col) -> IDX {
-        return layout_type::is_row_major() ?
-            (row*IDX(COL_SIZE) + col) & IDX(s_mask_per_register) :
-            (col*IDX(ROW_SIZE) + row) & IDX(s_mask_per_register);
+      // strided data
+      else {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+          self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else {
+          self.load_strided_nm(ptr,
+                               ref.m_stride[0],
+                               ref.m_stride[1],
+                               ref.m_tile.m_size[0],
+                               ref.m_tile.m_size[1]);
+        }
       }
+    }
 
-      using base_type::m_registers;
 
-    public:
+    /*!
+     * @brief Performs store specified by TensorRef object.
+     */
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void store_ref(self_type const &self, RefType &ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      TensorRegister() : base_type() {}
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>()) {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+          self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else {
+          self.store_packed_nm(ptr,
+                               ref.m_stride[0],
+                               ref.m_stride[1],
+                               ref.m_tile.m_size[0],
+                               ref.m_tile.m_size[1]);
+        }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegister(element_type c) : base_type(c)
-      {
-        this->broadcast(c);
       }
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegister(self_type const &c) : base_type(c)
-      {
-        this->copy(c);
+      // strided data
+      else {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+          self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else {
+          self.store_strided_nm(ptr,
+                                ref.m_stride[0],
+                                ref.m_stride[1],
+                                ref.m_tile.m_size[0],
+                                ref.m_tile.m_size[1]);
+        }
       }
+    }
+  };
+
+
+  template <typename POINTER_TYPE,
+            typename INDEX_TYPE,
+            RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE StrideInt1,
+            INDEX_TYPE StrideInt2,
+            INDEX_TYPE BeginInt1,
+            INDEX_TYPE BeginInt2,
+            INDEX_TYPE SizeInt1,
+            INDEX_TYPE SizeInt2,
+            camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<RAJA::internal::expt::StaticTensorRef<
+      POINTER_TYPE,
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      camp::int_seq<INDEX_TYPE, StrideInt1, StrideInt2>,
+      camp::int_seq<INDEX_TYPE, BeginInt1, BeginInt2>,
+      camp::int_seq<INDEX_TYPE, SizeInt1, SizeInt2>,
+      STRIDE_ONE_DIM>> {
+
+    using RefType = RAJA::internal::expt::StaticTensorRef<
+        POINTER_TYPE,
+        INDEX_TYPE,
+        TENSOR_SIZE,
+        camp::int_seq<INDEX_TYPE, StrideInt1, StrideInt2>,
+        camp::int_seq<INDEX_TYPE, BeginInt1, BeginInt2>,
+        camp::int_seq<INDEX_TYPE, SizeInt1, SizeInt2>,
+        STRIDE_ONE_DIM>;
+
+    /*!
+     * @brief Performs load specified by TensorRef object.
+     */
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void load_ref(self_type &self, RefType const &ref)
+    {
+
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
+
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>()) {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+          self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else {
+          self.load_packed_nm(ptr,
+                              ref.m_stride[0],
+                              ref.m_stride[1],
+                              ref.m_tile.m_size[0],
+                              ref.m_tile.m_size[1]);
+        }
 
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      ~TensorRegister(){}
-
-
-      /*!
-       * Returns true if the underlying data packed for a given tensor ref
-       *
-       * This is true if either:
-       *   It's column major and the rows are stride one
-       *   It's row major and the columns are stride one
-       */
-      template<camp::idx_t STRIDE_ONE_DIM>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_ref_packed() {
-        return (STRIDE_ONE_DIM == 0 && layout_type::is_column_major()) ||
-            (STRIDE_ONE_DIM == 1 && layout_type::is_row_major());
       }
-
-      /*!
-       * Gets the maximum size of matrix along specified dimension
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr camp::idx_t s_dim_elem(camp::idx_t dim){
-        return dim == 0 ? ROW_SIZE : COL_SIZE;
+      // strided data
+      else {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+          self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else {
+          self.load_strided_nm(ptr,
+                               ref.m_stride[0],
+                               ref.m_stride[1],
+                               ref.m_tile.m_size[0],
+                               ref.m_tile.m_size[1]);
+        }
       }
+    }
 
 
+    /*!
+     * @brief Performs store specified by TensorRef object.
+     */
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void store_ref(self_type const &self, RefType &ref)
+    {
 
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(element_type value)
-      {
-        this->broadcast(value);
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        return this->copy(c);
-      }
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
 
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>()) {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+          self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else {
+          self.store_packed_nm(ptr,
+                               ref.m_stride[0],
+                               ref.m_stride[1],
+                               ref.m_tile.m_size[0],
+                               ref.m_tile.m_size[1]);
+        }
 
-      /*!
-       * Provide matrix-matrix multiply for operator* between to matrices
-       */
-      template<typename T2, typename L, typename RP>
-      self_type
-      operator*(SquareMatrixRegister<T2, L, RP> const &y) const
-      {
-        return matrix_multiply(y);
       }
-
-      /*!
-       * Provide right matrix-vector multiply for operator* between this
-       * matrix and a vector.
-       */
-      template<typename T2, typename RP>
-      VectorRegister<T2, RP>
-      operator*(VectorRegister<T2, RP> const &y) const
-      {
-        return right_multiply_vector(y);
+      // strided data
+      else {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+          self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else {
+          self.store_strided_nm(ptr,
+                                ref.m_stride[0],
+                                ref.m_stride[1],
+                                ref.m_tile.m_size[0],
+                                ref.m_tile.m_size[1]);
+        }
       }
+    }
+  };
+
+
+  /*!
+   * Loads a dense full matrix from memory.
+   *
+   * For row-major, column entries must be stride-1
+   * For column-major, row entries must be stride-1
+   *
+   * Non-stride-1 dimension can have any striding... so this can
+   * be a "semi-dense" matrix.
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr,
+                         int row_stride,
+                         int col_stride)
+  {
+    // if it's dense in columns and rows, just do a dense load
+    if ((layout_type::is_row_major() && (row_stride == COL_SIZE)) ||
+        (layout_type::is_column_major() && (col_stride == ROW_SIZE))) {
 
-
-      template<typename REF_TYPE>
-      struct RefBridge;
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type& load_ref (REF_TYPE const &ref){
-          RefBridge<REF_TYPE>::load_ref(*this,ref);
-          return *this;
+      for (camp::idx_t reg = 0; reg < s_num_registers; ++reg) {
+        m_registers[reg].load_packed(ptr + reg * s_elements_per_register);
       }
 
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_ref (REF_TYPE &ref) const {
-          RefBridge<REF_TYPE>::store_ref(*this,ref);
-          return *this;
-      }
+    }
+    // Do semi-dense load for row-major
+    else if (layout_type::is_row_major()) {
 
+      // one or more registers per row
+      if (s_minor_dim_registers) {
+        camp::idx_t reg = 0;
+        for (camp::idx_t row = 0; row < ROW_SIZE; ++row) {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers;
+               ++colreg) {
 
+            camp::idx_t offset =
+                row * row_stride + colreg * s_elements_per_register;
 
-      template<typename POINTER_TYPE, typename INDEX_TYPE, RAJA::internal::expt::TensorTileSize TENSOR_SIZE, camp::idx_t STRIDE_ONE_DIM>
-      struct RefBridge <RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>>
-      {
-
-          using RefType = RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void load_ref(self_type& self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>()){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                    ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-    
-    
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>())
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
+            m_registers[reg].load_packed(ptr + offset);
 
-      };
-
-
-
-
-      template<
-           typename POINTER_TYPE,
-           typename INDEX_TYPE,
-           RAJA::internal::expt::TensorTileSize TENSOR_SIZE, 
-           INDEX_TYPE StrideInt1, INDEX_TYPE StrideInt2,
-           INDEX_TYPE  BeginInt1, INDEX_TYPE  BeginInt2,
-           INDEX_TYPE   SizeInt1, INDEX_TYPE   SizeInt2,
-           camp::idx_t STRIDE_ONE_DIM
-      >
-      struct RefBridge
-      <RAJA::internal::expt::StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,camp::int_seq<INDEX_TYPE,StrideInt1,StrideInt2>,camp::int_seq<INDEX_TYPE,BeginInt1,BeginInt2>,camp::int_seq<INDEX_TYPE,SizeInt1,SizeInt2>,STRIDE_ONE_DIM>>
-      {
-
-          using RefType = RAJA::internal::expt::StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,camp::int_seq<INDEX_TYPE,StrideInt1,StrideInt2>,camp::int_seq<INDEX_TYPE,BeginInt1,BeginInt2>,camp::int_seq<INDEX_TYPE,SizeInt1,SizeInt2>,STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void load_ref(self_type& self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>()){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                    ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
+            reg++;
           }
-    
-    
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>())
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-
-      };
-
-
-
-
-
-      /*!
-       * Loads a dense full matrix from memory.
-       *
-       * For row-major, column entries must be stride-1
-       * For column-major, row entries must be stride-1
-       *
-       * Non-stride-1 dimension can have any striding... so this is can
-       * be a "semi-dense" matrix.
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr,
-          int row_stride, int col_stride)
-      {
-        // if it's dense in columns and rows, just do a dense load
-        if((layout_type::is_row_major()&&(row_stride==COL_SIZE)) ||
-           (layout_type::is_column_major()&&(col_stride==ROW_SIZE))){
-
-          for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-            m_registers[reg].load_packed(ptr + reg*s_elements_per_register);
-          }
-
         }
-        // Do semi-dense load for row-major
-        else if(layout_type::is_row_major()){
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            camp::idx_t reg = 0;
-            for(camp::idx_t row = 0;row < ROW_SIZE;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
+      }
+      // more than one row per register
+      else {
+        // default to strided operation
+        return load_strided(ptr, row_stride, col_stride);
+      }
+    }
+    // Do semi-dense load for column-major
+    else {
+      // one or more registers per column
+      if (s_minor_dim_registers) {
 
-                camp::idx_t offset = row*row_stride + colreg*s_elements_per_register;
+        camp::idx_t reg = 0;
+        for (camp::idx_t col = 0; col < COL_SIZE; ++col) {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers;
+               ++rowreg) {
 
-                m_registers[reg].load_packed(ptr + offset);
+            camp::idx_t offset =
+                col * col_stride + rowreg * s_elements_per_register;
 
-                reg ++;
+            m_registers[reg].load_packed(ptr + offset);
 
-              }
-            }
-          }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return load_strided(ptr, row_stride, col_stride);
+            reg++;
           }
         }
-        // Do semi-dense load for column-major
-        else{
-          // one or more registers per row
-          if(s_minor_dim_registers){
-
-            camp::idx_t reg = 0;
-            for(camp::idx_t col = 0;col < COL_SIZE;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
-
-                camp::idx_t offset = col*col_stride + rowreg*s_elements_per_register;
+      }
+      // more than one column per register
+      else {
+        // default to strided operation
+        return load_strided(ptr, row_stride, col_stride);
+      }
+    }
 
-                m_registers[reg].load_packed(ptr + offset);
+    return *this;
+  }
 
-                reg ++;
+  /*!
+   * Loads a strided full matrix from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr,
+                          int row_stride,
+                          int col_stride)
+  {
 
-              }
-            }
-          }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return load_strided(ptr, row_stride, col_stride);
-          }
+    if (layout_type::is_row_major()) {
+      // one or more registers per row
+      if (s_minor_dim_registers) {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t col =
+              s_elements_per_register * (i - (row * s_minor_dim_registers));
+          m_registers[i].load_strided(ptr + row * row_stride + col * col_stride,
+                                      col_stride);
         }
-
-        return *this;
       }
-
-      /*!
-       * Loads a strided full matrix from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr,
-          int row_stride, int col_stride)
-      {
-
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-              m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, col_stride);
-            }
-          }
-          // less than one register per row
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type const *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load(ptr_i, s_segbits, col_stride, row_stride);
-            }
-          }
+      // less than one register per row
+      else {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          element_type const *ptr_i =
+              ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load(ptr_i,
+                                        s_segbits,
+                                        col_stride,
+                                        row_stride);
         }
+      }
+    }
 
-        // column major
-        else{
+    // column major
+    else {
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
+      // one or more registers per column
+      if (s_minor_dim_registers) {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t row =
+              s_elements_per_register * (i - (col * s_minor_dim_registers));
 
-              m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, row_stride);
-            }
-          }
-          // less than one register per column
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type const *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load(ptr_i, s_segbits, row_stride, col_stride);
-            }
-          }
+          m_registers[i].load_strided(ptr + row * row_stride + col * col_stride,
+                                      row_stride);
         }
-
-        return *this;
       }
+      // less than one register per column
+      else {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          element_type const *ptr_i =
+              ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load(ptr_i,
+                                        s_segbits,
+                                        row_stride,
+                                        col_stride);
+        }
+      }
+    }
 
-      /*!
-       * Loads a dense partial matrix from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed_nm(element_type const *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols)
-      {
-
-        if(layout_type::is_row_major()){
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
+    return *this;
+  }
 
-            for(camp::idx_t row = 0;row < num_rows;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
+  /*!
+   * Loads a dense partial matrix from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_packed_nm(element_type const *ptr,
+                            int row_stride,
+                            int col_stride,
+                            int num_rows,
+                            int num_cols)
+  {
 
-                camp::idx_t reg = row*s_minor_dim_registers + colreg;
+    if (layout_type::is_row_major()) {
 
-                camp::idx_t col0 = colreg*s_elements_per_register;
-                camp::idx_t offset = row*row_stride + col0;
+      // one or more registers per column
+      if (s_minor_dim_registers) {
 
-                // loading a complete register
-                if(col0+s_elements_per_register <= num_cols){
-                  m_registers[reg].load_packed(ptr + offset);
-                }
+        for (camp::idx_t row = 0; row < num_rows; ++row) {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers;
+               ++colreg) {
 
-                // partial register at end of row
-                else{
-                  m_registers[reg].load_packed_n(ptr + offset, num_cols - col0);
+            camp::idx_t reg = row * s_minor_dim_registers + colreg;
 
-                  // zero out the remaining registers, if any
-                  for(camp::idx_t i = colreg+1;i < s_minor_dim_registers;++i){
-                    reg++;
-                    m_registers[reg] = element_type(0);
-                  }
+            camp::idx_t col0 = colreg * s_elements_per_register;
+            camp::idx_t offset = row * row_stride + col0;
 
-                  break; // end this row
-                }
-              }
+            // loading a complete register
+            if (col0 + s_elements_per_register <= num_cols) {
+              m_registers[reg].load_packed(ptr + offset);
             }
 
-            // zero out remaining rows
-            for(camp::idx_t row = num_rows;row < ROW_SIZE;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
-
-                camp::idx_t reg = row*s_minor_dim_registers + colreg;
+            // partial register at end of row
+            else {
+              m_registers[reg].load_packed_n(ptr + offset, num_cols - col0);
 
+              // zero out the remaining registers, if any
+              for (camp::idx_t i = colreg + 1; i < s_minor_dim_registers; ++i) {
+                reg++;
                 m_registers[reg] = element_type(0);
               }
+
+              break;  // end this row
             }
           }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
-          }
         }
-        // Do semi-dense load for column-major
-        else{
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
+        // zero out remaining rows
+        for (camp::idx_t row = num_rows; row < ROW_SIZE; ++row) {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers;
+               ++colreg) {
 
-            for(camp::idx_t col = 0;col < num_cols;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
+            camp::idx_t reg = row * s_minor_dim_registers + colreg;
 
-                camp::idx_t reg = col*s_minor_dim_registers + rowreg;
+            m_registers[reg] = element_type(0);
+          }
+        }
+      }
+      // more than one column per register
+      else {
+        // default to strided operation
+        return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+      }
+    }
+    // Do semi-dense load for column-major
+    else {
 
-                camp::idx_t row0 = rowreg*s_elements_per_register;
-                camp::idx_t offset = col*col_stride + row0;
+      // one or more registers per column
+      if (s_minor_dim_registers) {
 
-                // loading a complete register
-                if(row0+s_elements_per_register <= num_rows){
-                  m_registers[reg].load_packed(ptr + offset);
-                }
+        for (camp::idx_t col = 0; col < num_cols; ++col) {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers;
+               ++rowreg) {
 
-                // partial register at end of column
-                else{
-                  m_registers[reg].load_packed_n(ptr + offset, num_rows - row0);
+            camp::idx_t reg = col * s_minor_dim_registers + rowreg;
 
-                  // zero out the remaining registers, if any
-                  for(camp::idx_t i = rowreg+1;i < s_minor_dim_registers;++i){
-                    reg++;
-                    m_registers[reg] = element_type(0);
-                  }
+            camp::idx_t row0 = rowreg * s_elements_per_register;
+            camp::idx_t offset = col * col_stride + row0;
 
-                  break; // end this column
-                }
-              }
+            // loading a complete register
+            if (row0 + s_elements_per_register <= num_rows) {
+              m_registers[reg].load_packed(ptr + offset);
             }
-            // zero out remaining columns
-            for(camp::idx_t col = num_cols;col < COL_SIZE;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
 
-                camp::idx_t reg = col*s_minor_dim_registers + rowreg;
+            // partial register at end of column
+            else {
+              m_registers[reg].load_packed_n(ptr + offset, num_rows - row0);
 
+              // zero out the remaining registers, if any
+              for (camp::idx_t i = rowreg + 1; i < s_minor_dim_registers; ++i) {
+                reg++;
                 m_registers[reg] = element_type(0);
               }
-            }
 
+              break;  // end this column
+            }
           }
-          // more than one column per register
-          else{
+        }
+        // zero out remaining columns
+        for (camp::idx_t col = num_cols; col < COL_SIZE; ++col) {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers;
+               ++rowreg) {
 
-            // default to strided operation
-            return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+            camp::idx_t reg = col * s_minor_dim_registers + rowreg;
+
+            m_registers[reg] = element_type(0);
           }
         }
 
-        return *this;
       }
+      // more than one column per register
+      else {
 
-      /*!
-       * Loads a strided partial matrix from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided_nm(element_type const *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols)
-      {
-
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(row >= num_rows){
-                m_registers[i] = element_type(0);
-              }
-              else{
-                camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
+        // default to strided operation
+        return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+      }
+    }
 
+    return *this;
+  }
 
-                camp::idx_t reg_num_cols = s_elements_per_register;
-                if(reg_num_cols+col > num_cols){
-                  reg_num_cols = num_cols-col;
-                  m_registers[i].load_strided_n(ptr+row*row_stride+col*col_stride, col_stride, reg_num_cols);
-                }
-                else{
-                  m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, col_stride);
-                }
-
+  /*!
+   * Loads a strided partial matrix from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_strided_nm(element_type const *ptr,
+                             int row_stride,
+                             int col_stride,
+                             int num_rows,
+                             int num_cols)
+  {
 
-              }
+    if (layout_type::is_row_major()) {
+      // one or more registers per row
+      if (s_minor_dim_registers) {
+
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (row >= num_rows) {
+            m_registers[i] = element_type(0);
+          } else {
+            camp::idx_t col =
+                s_elements_per_register * (i - (row * s_minor_dim_registers));
+
+
+            camp::idx_t reg_num_cols = s_elements_per_register;
+            if (reg_num_cols + col > num_cols) {
+              reg_num_cols = num_cols - col;
+              m_registers[i].load_strided_n(ptr + row * row_stride +
+                                                col * col_stride,
+                                            col_stride,
+                                            reg_num_cols);
+            } else {
+              m_registers[i].load_strided(ptr + row * row_stride +
+                                              col * col_stride,
+                                          col_stride);
             }
           }
-          // less than one register per row
-          else
-          {
-
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many rows get loaded in this register
-              camp::idx_t reg_num_rows = num_rows - i*s_major_dim_per_register;
-              reg_num_rows = reg_num_rows > s_major_dim_per_register ? s_major_dim_per_register : reg_num_rows;
-
-              element_type const *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load_nm(ptr_i, s_segbits, col_stride, row_stride, num_cols, reg_num_rows);
+        }
+      }
+      // less than one register per row
+      else {
+
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          // figure out how many rows get loaded in this register
+          camp::idx_t reg_num_rows = num_rows - i * s_major_dim_per_register;
+          reg_num_rows = reg_num_rows > s_major_dim_per_register
+                             ? s_major_dim_per_register
+                             : reg_num_rows;
+
+          element_type const *ptr_i =
+              ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load_nm(
+              ptr_i, s_segbits, col_stride, row_stride, num_cols, reg_num_rows);
+        }
+      }
+    }
+
+    // column major
+    else {
+
+      // one or more registers per column
+      if (s_minor_dim_registers) {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (col >= num_cols) {
+            m_registers[i] = element_type(0);
+          } else {
+            camp::idx_t row =
+                s_elements_per_register * (i - (col * s_minor_dim_registers));
+
+            camp::idx_t reg_num_rows = s_elements_per_register;
+            if (reg_num_rows + row > num_rows) {
+              reg_num_rows = num_rows - row;
+              m_registers[i].load_strided_n(ptr + row * row_stride +
+                                                col * col_stride,
+                                            row_stride,
+                                            reg_num_rows);
+            } else {
+              m_registers[i].load_strided(ptr + row * row_stride +
+                                              col * col_stride,
+                                          row_stride);
             }
           }
         }
+      }
+      // less than one register per column
+      else {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          // figure out how many columns get loaded in this register
+          camp::idx_t reg_num_cols = num_cols - i * s_major_dim_per_register;
+          reg_num_cols = reg_num_cols > s_major_dim_per_register
+                             ? s_major_dim_per_register
+                             : reg_num_cols;
+
+          element_type const *ptr_i =
+              ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load_nm(
+              ptr_i, s_segbits, row_stride, col_stride, num_rows, reg_num_cols);
+        }
+      }
+    }
 
-        // column major
-        else{
+    return *this;
+  }
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(col >= num_cols){
-                m_registers[i] = element_type(0);
-              }
-              else{
-                camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
 
-                camp::idx_t reg_num_rows = s_elements_per_register;
-                if(reg_num_rows+row > num_rows){
-                  reg_num_rows = num_rows-row;
-                  m_registers[i].load_strided_n(ptr+row*row_stride+col*col_stride, row_stride, reg_num_rows);
-                }
-                else{
-                  m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, row_stride);
-                }
-              }
-            }
-          }
-          // less than one register per column
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many columns get loaded in this register
-              camp::idx_t reg_num_cols = num_cols - i*s_major_dim_per_register;
-              reg_num_cols = reg_num_cols > s_major_dim_per_register ? s_major_dim_per_register : reg_num_cols;
-
-              element_type const *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load_nm(ptr_i, s_segbits, row_stride, col_stride, num_rows, reg_num_cols);
-            }
-          }
-        }
+  /*!
+   * Store a dense full matrix to memory.
+   *
+   * Column entries must be stride-1, rows may be any striding
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr,
+                                int row_stride,
+                                int col_stride) const
+  {
 
-        return *this;
+    // if it's dense in columns and rows, just do a dense store
+    if ((layout_type::is_row_major() && (row_stride == COL_SIZE)) ||
+        (layout_type::is_column_major() && (col_stride == ROW_SIZE))) {
+
+      for (camp::idx_t reg = 0; reg < s_num_registers; ++reg) {
+        m_registers[reg].store_packed(ptr + reg * s_elements_per_register);
       }
 
+    }
+    // Do semi-dense store for row-major
+    else if (layout_type::is_row_major()) {
+
+      // one or more registers per column
+      if (s_minor_dim_registers) {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t col =
+              s_elements_per_register * (i - (row * s_minor_dim_registers));
+          m_registers[i].store_packed(ptr + row * row_stride +
+                                      col * col_stride);
+        }
+      }
+      // more than one column per register
+      else {
+        store_strided(ptr, row_stride, col_stride);
+      }
+    }
+    // Do semi-dense store for column-major
+    else {
+      // one or more registers per row
+      if (s_minor_dim_registers) {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t row =
+              s_elements_per_register * (i - (col * s_minor_dim_registers));
+          m_registers[i].store_packed(ptr + row * row_stride +
+                                      col * col_stride);
+        }
+      }
+      // more than one row per register
+      else {
+        store_strided(ptr, row_stride, col_stride);
+      }
+    }
 
 
-      /*!
-       * Store a dense full matrix to memory.
-       *
-       * Column entries must be stride-1, rows may be any striding
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr,
-          int row_stride, int col_stride) const
-      {
+    return *this;
+  }
 
-        // if it's dense in columns and rows, just do a dense load
-        if((layout_type::is_row_major()&&(row_stride==COL_SIZE)) ||
-           (layout_type::is_column_major()&&(col_stride==ROW_SIZE))){
+  /*!
+   * Store a strided full matrix to memory
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr,
+                                 int row_stride,
+                                 int col_stride) const
+  {
 
-          for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-            m_registers[reg].store_packed(ptr + reg*s_elements_per_register);
-          }
 
+    if (layout_type::is_row_major()) {
+      // one or more registers per row
+      if (s_minor_dim_registers) {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t col =
+              s_elements_per_register * (i - (row * s_minor_dim_registers));
+          m_registers[i].store_strided(ptr + row * row_stride +
+                                           col * col_stride,
+                                       col_stride);
         }
-        // Do semi-dense store for row-major
-        else if(layout_type::is_row_major()){
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-              m_registers[i].store_packed(ptr+row*row_stride+col*col_stride);
-            }
-          }
-          // more than one column per register
-          else{
-            store_strided(ptr, row_stride, col_stride);
-          }
-        }
-        // Do semi-dense store for column-major
-        else{
-          // one or more registers per row
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-              m_registers[i].store_packed(ptr+row*row_stride+col*col_stride);
-            }
-          }
-          // more than one row per register
-          else{
-            store_strided(ptr, row_stride, col_stride);
-          }
+      }
+      // less than one register per row
+      else {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          element_type *ptr_i = ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store(ptr_i,
+                                         s_segbits,
+                                         col_stride,
+                                         row_stride);
         }
-
-
-        return *this;
       }
-
-      /*!
-       * Store a strided full matrix to memory
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr,
-          int row_stride, int col_stride) const
-      {
-
-
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-              m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, col_stride);
-            }
-          }
-          // less than one register per row
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store(ptr_i, s_segbits, col_stride, row_stride);
-            }
-          }
+    }
+
+    // column major
+    else {
+      // one or more registers per column
+      if (s_minor_dim_registers) {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t row =
+              s_elements_per_register * (i - (col * s_minor_dim_registers));
+          m_registers[i].store_strided(ptr + row * row_stride +
+                                           col * col_stride,
+                                       row_stride);
         }
-
-        // column major
-        else{
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-              m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, row_stride);
-            }
-          }
-          // less than one register per column
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store(ptr_i, s_segbits, row_stride, col_stride);
-            }
-          }
+      }
+      // less than one register per column
+      else {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          element_type *ptr_i = ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store(ptr_i,
+                                         s_segbits,
+                                         row_stride,
+                                         col_stride);
         }
-
-        return *this;
       }
+    }
 
-      /*!
-       * Store a dense partial matrix to memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed_nm(element_type *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols) const
-      {
+    return *this;
+  }
 
+  /*!
+   * Store a dense partial matrix to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_packed_nm(element_type *ptr,
+                                   int row_stride,
+                                   int col_stride,
+                                   int num_rows,
+                                   int num_cols) const
+  {
 
-        if(layout_type::is_row_major()){
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
+    if (layout_type::is_row_major()) {
 
-            for(camp::idx_t row = 0;row < num_rows;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
+      // one or more registers per column
+      if (s_minor_dim_registers) {
 
-                camp::idx_t reg = row*s_minor_dim_registers + colreg;
+        for (camp::idx_t row = 0; row < num_rows; ++row) {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers;
+               ++colreg) {
 
-                camp::idx_t col0 = colreg*s_elements_per_register;
-                camp::idx_t offset = row*row_stride + col0;
+            camp::idx_t reg = row * s_minor_dim_registers + colreg;
 
-                // store a complete register
-                if(col0+s_elements_per_register <= num_cols){
-                  m_registers[reg].store_packed(ptr + offset);
-                }
+            camp::idx_t col0 = colreg * s_elements_per_register;
+            camp::idx_t offset = row * row_stride + col0;
 
-                // partial register at end of row
-                else{
-                  m_registers[reg].store_packed_n(ptr + offset, num_cols - col0);
-
-                  break; // end this row
-                }
-              }
+            // store a complete register
+            if (col0 + s_elements_per_register <= num_cols) {
+              m_registers[reg].store_packed(ptr + offset);
             }
 
-          }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return store_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+            // partial register at end of row
+            else {
+              m_registers[reg].store_packed_n(ptr + offset, num_cols - col0);
+
+              break;  // end this row
+            }
           }
         }
-        // Do semi-dense store for column-major
-        else{
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
-
-            for(camp::idx_t col = 0;col < num_cols;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
+      }
+      // more than one column per register
+      else {
+        // default to strided operation
+        return store_strided_nm(
+            ptr, row_stride, col_stride, num_rows, num_cols);
+      }
+    }
+    // Do semi-dense store for column-major
+    else {
 
-                camp::idx_t reg = col*s_minor_dim_registers + rowreg;
+      // one or more registers per column
+      if (s_minor_dim_registers) {
 
-                camp::idx_t row0 = rowreg*s_elements_per_register;
-                camp::idx_t offset = col*col_stride + row0;
+        for (camp::idx_t col = 0; col < num_cols; ++col) {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers;
+               ++rowreg) {
 
-                // loading a complete register
-                if(row0+s_elements_per_register <= num_rows){
-                  m_registers[reg].store_packed(ptr + offset);
-                }
+            camp::idx_t reg = col * s_minor_dim_registers + rowreg;
 
-                // partial register at end of column
-                else{
-                  m_registers[reg].store_packed_n(ptr + offset, num_rows - row0);
+            camp::idx_t row0 = rowreg * s_elements_per_register;
+            camp::idx_t offset = col * col_stride + row0;
 
-                  break; // end this column
-                }
-              }
+            // loading a complete register
+            if (row0 + s_elements_per_register <= num_rows) {
+              m_registers[reg].store_packed(ptr + offset);
             }
 
-          }
-          // more than one column per register
-          else{
+            // partial register at end of column
+            else {
+              m_registers[reg].store_packed_n(ptr + offset, num_rows - row0);
 
-            // default to strided operation
-            return store_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+              break;  // end this column
+            }
           }
         }
 
-        return *this;
       }
+      // more than one column per register
+      else {
 
-      /*!
-       * Store a strided partial matrix to memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided_nm(element_type *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols) const
-      {
-
-
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(row < num_rows){
-                camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-
-
-                camp::idx_t reg_num_cols = s_elements_per_register;
-                if(reg_num_cols+col > num_cols){
-                  reg_num_cols = num_cols-col;
-                  m_registers[i].store_strided_n(ptr+row*row_stride+col*col_stride, col_stride, reg_num_cols);
-                }
-                else{
-                  m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, col_stride);
-                }
+        // default to strided operation
+        return store_strided_nm(
+            ptr, row_stride, col_stride, num_rows, num_cols);
+      }
+    }
 
+    return *this;
+  }
 
-              }
-            }
-          }
-          // less than one register per row
-          else
-          {
+  /*!
+   * Store a strided partial matrix to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_strided_nm(element_type *ptr,
+                                    int row_stride,
+                                    int col_stride,
+                                    int num_rows,
+                                    int num_cols) const
+  {
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many rows get loaded in this register
-              camp::idx_t reg_num_rows = num_rows - i*s_major_dim_per_register;
-              reg_num_rows = reg_num_rows > s_major_dim_per_register ? s_major_dim_per_register : reg_num_rows;
 
-              element_type *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store_nm(ptr_i, s_segbits, col_stride, row_stride, num_cols, reg_num_rows);
+    if (layout_type::is_row_major()) {
+      // one or more registers per row
+      if (s_minor_dim_registers) {
+
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (row < num_rows) {
+            camp::idx_t col =
+                s_elements_per_register * (i - (row * s_minor_dim_registers));
+
+
+            camp::idx_t reg_num_cols = s_elements_per_register;
+            if (reg_num_cols + col > num_cols) {
+              reg_num_cols = num_cols - col;
+              m_registers[i].store_strided_n(ptr + row * row_stride +
+                                                 col * col_stride,
+                                             col_stride,
+                                             reg_num_cols);
+            } else {
+              m_registers[i].store_strided(ptr + row * row_stride +
+                                               col * col_stride,
+                                           col_stride);
             }
           }
         }
-
-        // column major
-        else{
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(col < num_cols){
-                camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-
-                camp::idx_t reg_num_rows = s_elements_per_register;
-                if(reg_num_rows+row > num_rows){
-                  reg_num_rows = num_rows-row;
-                  m_registers[i].store_strided_n(ptr+row*row_stride+col*col_stride, row_stride, reg_num_rows);
-                }
-                else{
-                  m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, row_stride);
-                }
-              }
-            }
-          }
-          // less than one register per column
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many columns get loaded in this register
-              camp::idx_t reg_num_cols = num_cols - i*s_major_dim_per_register;
-              reg_num_cols = reg_num_cols > s_major_dim_per_register ? s_major_dim_per_register : reg_num_cols;
-
-              element_type *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store_nm(ptr_i, s_segbits, row_stride, col_stride, num_rows, reg_num_cols);
+      }
+      // less than one register per row
+      else {
+
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          // figure out how many rows get loaded in this register
+          camp::idx_t reg_num_rows = num_rows - i * s_major_dim_per_register;
+          reg_num_rows = reg_num_rows > s_major_dim_per_register
+                             ? s_major_dim_per_register
+                             : reg_num_rows;
+
+          element_type *ptr_i = ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store_nm(
+              ptr_i, s_segbits, col_stride, row_stride, num_cols, reg_num_rows);
+        }
+      }
+    }
+
+    // column major
+    else {
+
+      // one or more registers per column
+      if (s_minor_dim_registers) {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (col < num_cols) {
+            camp::idx_t row =
+                s_elements_per_register * (i - (col * s_minor_dim_registers));
+
+            camp::idx_t reg_num_rows = s_elements_per_register;
+            if (reg_num_rows + row > num_rows) {
+              reg_num_rows = num_rows - row;
+              m_registers[i].store_strided_n(ptr + row * row_stride +
+                                                 col * col_stride,
+                                             row_stride,
+                                             reg_num_rows);
+            } else {
+              m_registers[i].store_strided(ptr + row * row_stride +
+                                               col * col_stride,
+                                           row_stride);
             }
           }
         }
-
-        return *this;
       }
+      // less than one register per column
+      else {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          // figure out how many columns get loaded in this register
+          camp::idx_t reg_num_cols = num_cols - i * s_major_dim_per_register;
+          reg_num_cols = reg_num_cols > s_major_dim_per_register
+                             ? s_major_dim_per_register
+                             : reg_num_cols;
+
+          element_type *ptr_i = ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store_nm(
+              ptr_i, s_segbits, row_stride, col_stride, num_rows, reg_num_cols);
+        }
+      }
+    }
 
+    return *this;
+  }
 
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_nm(self_type const &mat, int num_rows, int num_cols) const {
-        self_type result;
-
-
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(row < num_rows){
-                camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_nm(self_type const &mat, int num_rows, int num_cols) const
+  {
+    self_type result;
 
 
-                camp::idx_t reg_num_cols = s_elements_per_register;
-                if(reg_num_cols+col > num_cols){
-                  reg_num_cols = num_cols-col;
-                  result.m_registers[i] = m_registers[i].divide_n(mat.m_registers[i], reg_num_cols);
-                }
-                else{
-                  result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
-                }
+    if (layout_type::is_row_major()) {
+      // one or more registers per row
+      if (s_minor_dim_registers) {
 
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (row < num_rows) {
+            camp::idx_t col =
+                s_elements_per_register * (i - (row * s_minor_dim_registers));
 
-              }
-            }
-          }
-          // less than one register per row
-          else
-          {
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many rows get loaded in this register
-              camp::idx_t reg_num_rows = num_rows - i*s_major_dim_per_register;
-              reg_num_rows = reg_num_rows > s_major_dim_per_register ? s_major_dim_per_register : reg_num_rows;
-
-              result.m_registers[i] = m_registers[i].segmented_divide_nm(mat.m_registers[i], s_segbits, num_cols, reg_num_rows);
+            camp::idx_t reg_num_cols = s_elements_per_register;
+            if (reg_num_cols + col > num_cols) {
+              reg_num_cols = num_cols - col;
+              result.m_registers[i] =
+                  m_registers[i].divide_n(mat.m_registers[i], reg_num_cols);
+            } else {
+              result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
             }
           }
         }
-
-        // column major
-        else{
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(col < num_cols){
-                camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-
-                camp::idx_t reg_num_rows = s_elements_per_register;
-                if(reg_num_rows+row > num_rows){
-                  reg_num_rows = num_rows-row;
-                  result.m_registers[i] = m_registers[i].divide_n(mat.m_registers[i], reg_num_rows);
-                }
-                else{
-                  result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
-                }
-              }
-            }
-          }
-          // less than one register per column
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many columns get loaded in this register
-              camp::idx_t reg_num_cols = num_cols - i*s_major_dim_per_register;
-              reg_num_cols = reg_num_cols > s_major_dim_per_register ? s_major_dim_per_register : reg_num_cols;
-
-              result.m_registers[i] = m_registers[i].segmented_divide_nm(mat.m_registers[i], s_segbits, num_rows, reg_num_cols);
+      }
+      // less than one register per row
+      else {
+
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          // figure out how many rows get loaded in this register
+          camp::idx_t reg_num_rows = num_rows - i * s_major_dim_per_register;
+          reg_num_rows = reg_num_rows > s_major_dim_per_register
+                             ? s_major_dim_per_register
+                             : reg_num_rows;
+
+          result.m_registers[i] = m_registers[i].segmented_divide_nm(
+              mat.m_registers[i], s_segbits, num_cols, reg_num_rows);
+        }
+      }
+    }
+
+    // column major
+    else {
+
+      // one or more registers per column
+      if (s_minor_dim_registers) {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (col < num_cols) {
+            camp::idx_t row =
+                s_elements_per_register * (i - (col * s_minor_dim_registers));
+
+            camp::idx_t reg_num_rows = s_elements_per_register;
+            if (reg_num_rows + row > num_rows) {
+              reg_num_rows = num_rows - row;
+              result.m_registers[i] =
+                  m_registers[i].divide_n(mat.m_registers[i], reg_num_rows);
+            } else {
+              result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
             }
           }
         }
-
-
-        return result;
       }
+      // less than one register per column
+      else {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+          // figure out how many columns get loaded in this register
+          camp::idx_t reg_num_cols = num_cols - i * s_major_dim_per_register;
+          reg_num_cols = reg_num_cols > s_major_dim_per_register
+                             ? s_major_dim_per_register
+                             : reg_num_cols;
+
+          result.m_registers[i] = m_registers[i].segmented_divide_nm(
+              mat.m_registers[i], s_segbits, num_rows, reg_num_cols);
+        }
+      }
+    }
 
 
+    return result;
+  }
 
-      /*!
-       * Matrix transpose, keeping layout
-       *
-       * Transpose is not completely implemented
-       */
+
+  /*!
+   * Matrix transpose, keeping layout
+   *
+   * Transpose is not completely implemented
+   */
 #if 0
       RAJA_HOST_DEVICE
       RAJA_INLINE
@@ -1291,386 +1415,406 @@ namespace expt
         return reinterpret_cast<transpose_tensor_type const &>(*this);
       }
 #endif
-      /*!
-       * Matrix vector product
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      column_vector_type right_multiply_vector(row_vector_type v) const {
-        column_vector_type result(0);
-        return right_multiply_vector_accumulate(v, result);
-      }
-
-      /*!
-       * Matrix vector product
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      row_vector_type left_multiply_vector(column_vector_type v) const {
-        row_vector_type result(0);
-        return left_multiply_vector_accumulate(v, result);
-      }
-
-
-      /*!
-       * Matrix vector product with accumulation into another vector
-       *
-       * acc += (this) * v
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      column_vector_type right_multiply_vector_accumulate(row_vector_type const &v, column_vector_type result) const {
-
-        if(layout_type::is_row_major()){
-
-          // 1 register is split over multiple rows
-          if(s_minor_dim_registers == 0){
-
-            // start by broadcasting the first segment in v across all of v
-            // we will use this term for all registers in the matrix
-            auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
-
-            // loop over output segments, which is also the number of
-            // registers in the matrix (no kidding!)
-            RAJA_UNROLL
-            for(camp::idx_t outseg = 0;outseg < s_num_registers;++ outseg){
-
-              // compute which result register we are accumulating into
-              camp::idx_t result_reg = outseg >> s_segbits;
+  /*!
+   * Matrix vector product
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  column_vector_type right_multiply_vector(row_vector_type v) const
+  {
+    column_vector_type result(0);
+    return right_multiply_vector_accumulate(v, result);
+  }
 
-              // compute which segment within result_reg we are accumulating into
-              camp::idx_t result_seg = outseg - (result_reg<<s_segbits);
+  /*!
+   * Matrix vector product
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  row_vector_type left_multiply_vector(column_vector_type v) const
+  {
+    row_vector_type result(0);
+    return left_multiply_vector_accumulate(v, result);
+  }
 
-              // compute segmented dot product to get output segment
-              auto value = m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
 
-              // accumulate result
-              result.get_register(result_reg) += value;
-            }
+  /*!
+   * Matrix vector product with accumulation into another vector
+   *
+   * acc += (this) * v
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  column_vector_type right_multiply_vector_accumulate(
+      row_vector_type const &v,
+      column_vector_type result) const
+  {
 
-          }
-          // one or more registers per row
-          else{
+    if (layout_type::is_row_major()) {
 
-            // Loop over rows
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t row = 0;row < s_num_rows;++ row){
+      // 1 register is split over multiple rows
+      if (s_minor_dim_registers == 0) {
 
-              // compute partial dot products for all registers in this row
-              auto rowsum = register_type(0);
-              RAJA_UNROLL
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers;++ colreg){
+        // start by broadcasting the first segment in v across all of v
+        // we will use this term for all registers in the matrix
+        auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
 
-                rowsum = m_registers[reg].multiply_add(v.get_register(colreg), rowsum);
-                reg ++;
+        // loop over output segments, which is also the number of
+        // registers in the matrix (no kidding!)
+        RAJA_UNROLL
+        for (camp::idx_t outseg = 0; outseg < s_num_registers; ++outseg) {
 
-              } // rowreg
+          // compute which result register we are accumulating into
+          camp::idx_t result_reg = outseg >> s_segbits;
 
-              // finish dot product by taking sum of rowsum
-              auto value = result.get(row) + rowsum.sum();
-              result.set(value, row);
+          // compute which segment within result_reg we are accumulating into
+          camp::idx_t result_seg = outseg - (result_reg << s_segbits);
 
-            } // row
-          }
+          // compute segmented dot product to get output segment
+          auto value =
+              m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
 
+          // accumulate result
+          result.get_register(result_reg) += value;
         }
-        else{
 
+      }
+      // one or more registers per row
+      else {
 
-          // 1 register is split over multiple columns
-          if(s_minor_dim_registers == 0){
-
-            auto &mv = result.get_register(0);
+        // Loop over rows
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t row = 0; row < s_num_rows; ++row) {
 
-            // Loop over registers, which are also the segments in v
-            RAJA_UNROLL
-            for(camp::idx_t m_reg = 0;m_reg < s_num_registers;++ m_reg){
-              camp::idx_t v_reg = m_reg >> s_segbits;
-              camp::idx_t v_seg = m_reg & ( (1<<s_segbits) - 1);
+          // compute partial dot products for all registers in this row
+          auto rowsum = register_type(0);
+          RAJA_UNROLL
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers;
+               ++colreg) {
 
-              auto v_tmp = v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
-              mv = m_registers[m_reg].multiply_add(v_tmp, mv);
+            rowsum =
+                m_registers[reg].multiply_add(v.get_register(colreg), rowsum);
+            reg++;
 
-            }
+          }  // rowreg
 
-            // Now sum segments in mv together to form final result
-            mv = mv.segmented_sum_outer(s_segbits, 0);
+          // finish dot product by taking sum of rowsum
+          auto value = result.get(row) + rowsum.sum();
+          result.set(value, row);
 
-          }
-          // one or more registers per column
-          else{
+        }  // row
+      }
 
-            // Loop over columns (which is also registers)
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t col = 0;col < s_num_columns;++ col){
+    } else {
 
-              // extract column value from v
-              auto v_col = register_type(v.get(col));
 
-              // apply v_col to entire column (1 or more registers)
-              RAJA_UNROLL
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers;++ rowreg){
+      // 1 register is split over multiple columns
+      if (s_minor_dim_registers == 0) {
 
-                auto &mv = result.get_register(rowreg);
-                mv = m_registers[reg].multiply_add(v_col, mv);
+        auto &mv = result.get_register(0);
 
-                reg ++;
-
-              } // rowreg
-            } // col
-          }
+        // Loop over registers, which are also the segments in v
+        RAJA_UNROLL
+        for (camp::idx_t m_reg = 0; m_reg < s_num_registers; ++m_reg) {
+          camp::idx_t v_reg = m_reg >> s_segbits;
+          camp::idx_t v_seg = m_reg & ((1 << s_segbits) - 1);
 
+          auto v_tmp =
+              v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
+          mv = m_registers[m_reg].multiply_add(v_tmp, mv);
         }
-        return result;
-      }
-
-      /*!
-       * Matrix vector product with accumulation into another vector
-       *
-       * acc += v * (this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      row_vector_type left_multiply_vector_accumulate(column_vector_type const &v, row_vector_type result) const {
 
-        if(layout_type::is_row_major()){
+        // Now sum segments in mv together to form final result
+        mv = mv.segmented_sum_outer(s_segbits, 0);
 
-          // 1 register is split over multiple columns
-          if(s_minor_dim_registers == 0){
-            auto &vm = result.get_register(0);
+      }
+      // one or more registers per column
+      else {
 
-            // Loop over registers, which are also the segments in v
-            RAJA_UNROLL
-            for(camp::idx_t m_reg = 0;m_reg < s_num_registers;++ m_reg){
-              camp::idx_t v_reg = m_reg >> s_segbits;
-              camp::idx_t v_seg = m_reg & ( (1<<s_segbits) - 1);
+        // Loop over columns (which is also registers)
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t col = 0; col < s_num_columns; ++col) {
 
-              auto v_tmp = v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
-              vm = m_registers[m_reg].multiply_add(v_tmp, vm);
+          // extract column value from v
+          auto v_col = register_type(v.get(col));
 
-            }
+          // apply v_col to entire column (1 or more registers)
+          RAJA_UNROLL
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers;
+               ++rowreg) {
 
-            // Now sum segments in mv together to form final result
-            vm = vm.segmented_sum_outer(s_segbits, 0);
+            auto &mv = result.get_register(rowreg);
+            mv = m_registers[reg].multiply_add(v_col, mv);
 
-          }
-          // one or more registers per row
-          else{
+            reg++;
 
-            // Loop over rows
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t row = 0;row < s_num_rows;++ row){
-              auto lhs_bcat = register_type(v.get(row));
-              RAJA_UNROLL
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers;++ colreg){
+          }  // rowreg
+        }    // col
+      }
+    }
+    return result;
+  }
+
+  /*!
+   * Matrix vector product with accumulation into another vector
+   *
+   * acc += v * (this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  row_vector_type left_multiply_vector_accumulate(column_vector_type const &v,
+                                                  row_vector_type result) const
+  {
 
-                result.get_register(colreg) =
-                    m_registers[reg].multiply_add(lhs_bcat, result.get_register(colreg));
-                reg ++;
+    if (layout_type::is_row_major()) {
 
-              } // rowreg
+      // 1 register is split over multiple columns
+      if (s_minor_dim_registers == 0) {
+        auto &vm = result.get_register(0);
 
-            }
+        // Loop over registers, which are also the segments in v
+        RAJA_UNROLL
+        for (camp::idx_t m_reg = 0; m_reg < s_num_registers; ++m_reg) {
+          camp::idx_t v_reg = m_reg >> s_segbits;
+          camp::idx_t v_seg = m_reg & ((1 << s_segbits) - 1);
 
-          }
+          auto v_tmp =
+              v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
+          vm = m_registers[m_reg].multiply_add(v_tmp, vm);
+        }
 
+        // Now sum segments in mv together to form final result
+        vm = vm.segmented_sum_outer(s_segbits, 0);
 
-        } // row-major
+      }
+      // one or more registers per row
+      else {
+
+        // Loop over rows
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t row = 0; row < s_num_rows; ++row) {
+          auto lhs_bcat = register_type(v.get(row));
+          RAJA_UNROLL
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers;
+               ++colreg) {
+
+            result.get_register(colreg) =
+                m_registers[reg].multiply_add(lhs_bcat,
+                                              result.get_register(colreg));
+            reg++;
+
+          }  // rowreg
+        }
+      }
 
-        // Column-major:
-        else{
-          // 1 register is split over multiple rows
-          if(s_minor_dim_registers == 0){
 
-            // start by broadcasting the first segment in v across all of v
-            // we will use this term for all registers in the matrix
-            auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
+    }  // row-major
 
-            // loop over output segments, which is also the number of
-            // registers in the matrix (no kidding!)
-            RAJA_UNROLL
-            for(camp::idx_t outseg = 0;outseg < s_num_registers;++ outseg){
+    // Column-major:
+    else {
+      // 1 register is split over multiple rows
+      if (s_minor_dim_registers == 0) {
 
-              // compute which result register we are accumulating into
-              camp::idx_t result_reg = outseg >> s_segbits;
+        // start by broadcasting the first segment in v across all of v
+        // we will use this term for all registers in the matrix
+        auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
 
-              // compute which segment within result_reg we are accumulating into
-              camp::idx_t result_seg = outseg - (result_reg<<s_segbits);
+        // loop over output segments, which is also the number of
+        // registers in the matrix (no kidding!)
+        RAJA_UNROLL
+        for (camp::idx_t outseg = 0; outseg < s_num_registers; ++outseg) {
 
-              // compute segmented dot product to get output segment
-              auto value = m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
+          // compute which result register we are accumulating into
+          camp::idx_t result_reg = outseg >> s_segbits;
 
-              // accumulate result
-              result.get_register(result_reg) += value;
-            }
+          // compute which segment within result_reg we are accumulating into
+          camp::idx_t result_seg = outseg - (result_reg << s_segbits);
 
-          }
-          // one or more registers per column
-          else{
-            // Loop over rows
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t col = 0;col < s_num_columns;++ col){
-
-              // compute partial dot products for all registers in this row
-              auto colsum = register_type(0);
-              RAJA_UNROLL
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers;++ rowreg){
-                colsum = m_registers[reg].multiply_add(v.get_register(rowreg), colsum);
-                reg ++;
-
-              } // rowreg
-
-              // finish dot product by taking sum of rowsum
-              auto value = result.get(col) + colsum.sum();
-              result.set(value, col);
-
-            } // col
-          }
+          // compute segmented dot product to get output segment
+          auto value =
+              m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
 
+          // accumulate result
+          result.get_register(result_reg) += value;
+        }
 
-        } // col-major
-        return result;
       }
-
-
-
-
-
-      /*!
-       * Matrix-Matrix product
-       */
-      template<typename RMAT>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
-      matrix_multiply(RMAT const &mat) const {
-        typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type res(0);
-        RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,RMAT>::multiply(*this, mat, res);
-        return res;
+      // one or more registers per column
+      else {
+        // Loop over rows
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t col = 0; col < s_num_columns; ++col) {
+
+          // compute partial dot products for all registers in this row
+          auto colsum = register_type(0);
+          RAJA_UNROLL
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers;
+               ++rowreg) {
+            colsum =
+                m_registers[reg].multiply_add(v.get_register(rowreg), colsum);
+            reg++;
+
+          }  // rowreg
+
+          // finish dot product by taking sum of colsum
+          auto value = result.get(col) + colsum.sum();
+          result.set(value, col);
+
+        }  // col
       }
 
-      /*!
-       * Matrix-Matrix multiply add
-       */
-      template<typename RMAT>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
-      matrix_multiply_add(RMAT const &B, typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type const &C) const {
-        typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type res(C);
-        RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,RMAT>::multiply_accumulate(*this, B, res);
-        return res;
-      }
 
-      /*!
-       * Matrix-Matrix multiply accumulate
-       */
-      template<typename ACCMAT, typename RMAT>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      void
-      matrix_multiply_accumulate(ACCMAT &acc, RMAT const &B) const {
-        RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,RMAT>::multiply_accumulate(*this, B, acc);
-      }
+    }  // col-major
+    return result;
+  }
 
 
+  /*!
+   * Matrix-Matrix product
+   */
+  template <typename RMAT>
+  RAJA_HOST_DEVICE RAJA_INLINE typename RAJA::internal::expt::
+      MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
+      matrix_multiply(RMAT const &mat) const
+  {
+    typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,
+                                                              RMAT>::result_type
+        res(0);
+    RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::multiply(
+        *this, mat, res);
+    return res;
+  }
+
+  /*!
+   * Matrix-Matrix multiply add
+   */
+  template <typename RMAT>
+  RAJA_HOST_DEVICE RAJA_INLINE typename RAJA::internal::expt::
+      MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
+      matrix_multiply_add(
+          RMAT const &B,
+          typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+              self_type,
+              RMAT>::result_type const &C) const
+  {
+    typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,
+                                                              RMAT>::result_type
+        res(C);
+    RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::
+        multiply_accumulate(*this, B, res);
+    return res;
+  }
+
+  /*!
+   * Matrix-Matrix multiply accumulate
+   */
+  template <typename ACCMAT, typename RMAT>
+  RAJA_HOST_DEVICE RAJA_INLINE void matrix_multiply_accumulate(
+      ACCMAT &acc,
+      RMAT const &B) const
+  {
+    RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::
+        multiply_accumulate(*this, B, acc);
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &set(element_type val, int row, int col){
-        m_registers[to_register(row, col)].set(val, to_lane(row,col));
-        return *this;
-      }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type get(int row, int col) const {
-        return m_registers[to_register(row, col)].get(to_lane(row,col));
-      }
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &set(element_type val, int row, int col)
+  {
+    m_registers[to_register(row, col)].set(val, to_lane(row, col));
+    return *this;
+  }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type get(int row, int col) const
+  {
+    return m_registers[to_register(row, col)].get(to_lane(row, col));
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      register_type extract_diagonal_register(camp::idx_t starting_column, camp::idx_t segbits, camp::idx_t segment) const {
 
-        register_type result(0);
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  register_type extract_diagonal_register(camp::idx_t starting_column,
+                                          camp::idx_t segbits,
+                                          camp::idx_t segment) const
+  {
 
-        camp::idx_t num_rows = register_type::s_num_elem >> segbits;
-        camp::idx_t num_repeats = 1 << segbits;
+    register_type result(0);
 
-        camp::idx_t col0 = (starting_column + num_rows*segment)%s_num_columns;
-        camp::idx_t row0 = num_rows*segment;
+    camp::idx_t num_rows = register_type::s_num_elem >> segbits;
+    camp::idx_t num_repeats = 1 << segbits;
 
-        for(camp::idx_t i = 0;i < num_rows;++i){
-          camp::idx_t col = (col0 + i) % s_num_columns;
-          camp::idx_t row = row0 + i;
-          auto value = get(row,col);
-          for(camp::idx_t j = 0;j < num_repeats;++j){
-            result.set(value, (i<<segbits) + j);
-          }
-        }
+    camp::idx_t col0 = (starting_column + num_rows * segment) % s_num_columns;
+    camp::idx_t row0 = num_rows * segment;
 
-        return result;
+    for (camp::idx_t i = 0; i < num_rows; ++i) {
+      camp::idx_t col = (col0 + i) % s_num_columns;
+      camp::idx_t row = row0 + i;
+      auto value = get(row, col);
+      for (camp::idx_t j = 0; j < num_repeats; ++j) {
+        result.set(value, (i << segbits) + j);
       }
+    }
 
+    return result;
+  }
 
-      /*!
-       * @brief Converts to matrix to a string
-       *
-       *
-       */
-      RAJA_INLINE
-      std::string to_string(bool one_line=false) const {
-        std::string s = "Matrix(" + std::to_string(s_num_rows) +
-            "x" + std::to_string(s_num_columns);
-        if(!one_line){
-          s +=")\n";
-        }
 
+  /*!
+   * @brief Converts the matrix to a string
+   *
+   *
+   */
+  RAJA_INLINE
+  std::string to_string(bool one_line = false) const
+  {
+    std::string s = "Matrix(" + std::to_string(s_num_rows) + "x" +
+                    std::to_string(s_num_columns);
+    if (!one_line) {
+      s += ")\n";
+    }
 
-        s += "[ ";
 
-        //
-        for(camp::idx_t r = 0;r < s_num_rows; ++ r){
-          if(r > 0){
-            s += ", ";
-            if(!one_line){
-              s+= "\n  ";
-            }
-          }
-          s += "[";
-          for(camp::idx_t c = 0;c < s_num_columns; ++ c){
-            if(c > 0){
-              s += ", ";
-            }
-            s += std::to_string(this->get(r,c));
-          }
-          s += "]";
-        }
+    s += "[ ";
 
-        s += " ]";
-        if(!one_line){
-          s+="\n";
+    //
+    for (camp::idx_t r = 0; r < s_num_rows; ++r) {
+      if (r > 0) {
+        s += ", ";
+        if (!one_line) {
+          s += "\n  ";
         }
-        return s;
       }
+      s += "[";
+      for (camp::idx_t c = 0; c < s_num_columns; ++c) {
+        if (c > 0) {
+          s += ", ";
+        }
+        s += std::to_string(this->get(r, c));
+      }
+      s += "]";
+    }
 
-  }; // MatrixRegisterImpl
-
-
+    s += " ]";
+    if (!one_line) {
+      s += "\n";
+    }
+    return s;
+  }
 
+};  // MatrixRegisterImpl
 
 
-} // namespace expt
+}  // namespace expt
 }  // namespace RAJA
 
 
-
-
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/RegisterBase.hpp b/include/RAJA/pattern/tensor/internal/RegisterBase.hpp
index 3480fda10c..c7f9687080 100644
--- a/include/RAJA/pattern/tensor/internal/RegisterBase.hpp
+++ b/include/RAJA/pattern/tensor/internal/RegisterBase.hpp
@@ -19,1198 +19,1184 @@
 #define RAJA_pattern_tensor_RegisterBase_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
-#include "camp/camp.hpp"
 #include "RAJA/pattern/tensor/TensorLayout.hpp"
 #include "RAJA/pattern/tensor/internal/TensorRef.hpp"
-#include "RAJA/util/BitMask.hpp"
-
 #include "RAJA/policy/tensor/arch.hpp"
+#include "RAJA/util/BitMask.hpp"
+#include "RAJA/util/macros.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
 namespace expt
 {
-  template<typename T, typename REGISTER_POLICY>
-  class Register;
+template <typename T, typename REGISTER_POLICY>
+class Register;
 }
 
 namespace internal
 {
 namespace expt
 {
-  class RegisterConcreteBase {};
+class RegisterConcreteBase
+{
+};
 
 
-  /*
-   * Overload for:    arithmetic + TensorRegister
+/*
+ * Overload for:    arithmetic + TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator+(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).add(rhs);
-  }
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator+(LEFT const &lhs, RIGHT const &rhs)
+{
+  return RIGHT(lhs).add(rhs);
+}
 
-  /*
-   * Overload for:    arithmetic - TensorRegister
+/*
+ * Overload for:    arithmetic - TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator-(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).subtract(rhs);
-  }
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator-(LEFT const &lhs, RIGHT const &rhs)
+{
+  return RIGHT(lhs).subtract(rhs);
+}
 
-  /*
-   * Overload for:    arithmetic * TensorRegister
+/*
+ * Overload for:    arithmetic * TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator*(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return rhs.scale(lhs);
-  }
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator*(LEFT const &lhs, RIGHT const &rhs)
+{
+  return rhs.scale(lhs);
+}
 
-  /*
-   * Overload for:    arithmetic / TensorRegister
+/*
+ * Overload for:    arithmetic / TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator/(LEFT const &lhs, RIGHT const &rhs)
+{
+  return RIGHT(lhs).divide(rhs);
+}
+
+
+/*!
+ * Register base class that provides some default behaviors and simplifies
+ * the implementation of new register types.
+ *
+ * This uses CRTP to provide static polymorphism
+ */
+template <typename Derived>
+class RegisterBase;
+
+template <typename T, typename REGISTER_POLICY>
+class RegisterBase<RAJA::expt::Register<T, REGISTER_POLICY>>
+    : public RegisterConcreteBase
+{
+public:
+  using self_type = RAJA::expt::Register<T, REGISTER_POLICY>;
+  using element_type = camp::decay<T>;
+
+  using index_type = camp::idx_t;
+
+  using int_element_type =
+      typename RegisterTraits<REGISTER_POLICY, T>::int_element_type;
+  using int_vector_type =
+      RAJA::expt::Register<int_element_type, REGISTER_POLICY>;
+
+private:
+  RAJA_HOST_DEVICE
   RAJA_INLINE
+  self_type *getThis() { return static_cast<self_type *>(this); }
+
   RAJA_HOST_DEVICE
-  RIGHT operator/(LEFT const &lhs, RIGHT const &rhs)
+  RAJA_INLINE
+  constexpr self_type const *getThis() const
   {
-    return RIGHT(lhs).divide(rhs);
+    return static_cast<self_type const *>(this);
   }
 
+public:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr bool is_root() { return true; }
 
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr RegisterBase() {}
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  ~RegisterBase() {}
 
-  /*!
-   * Register base class that provides some default behaviors and simplifies
-   * the implementation of new register types.
-   *
-   * This uses CRTP to provide static polymorphism
-   */
-  template<typename Derived>
-  class RegisterBase;
 
-  template<typename T, typename REGISTER_POLICY>
-  class RegisterBase<RAJA::expt::Register<T, REGISTER_POLICY>> :
-    public RegisterConcreteBase
-  {
-    public:
-      using self_type = RAJA::expt::Register<T, REGISTER_POLICY>;
-      using element_type = camp::decay<T>;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr RegisterBase(RegisterBase const &) {}
 
-      using index_type = camp::idx_t;
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr RegisterBase(self_type const &) {}
 
-      using int_element_type = typename RegisterTraits<REGISTER_POLICY, T>::int_element_type;
-      using int_vector_type = RAJA::expt::Register<int_element_type, REGISTER_POLICY>;
 
-    private:
+  /*!
+   * @brief Broadcast scalar value to first N register elements
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static self_type s_broadcast_n(element_type const &value, camp::idx_t N)
+  {
+    self_type x;
+    for (camp::idx_t i = 0; i < N; ++i) {
+      x.set(value, i);
+    }
+    return x;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type *getThis(){
-        return static_cast<self_type *>(this);
-      }
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.broadcast(getThis()->get(i));
+    return x;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      self_type const *getThis() const{
-        return static_cast<self_type const *>(this);
-      }
 
-    public:
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &gather(
+      element_type const *ptr,
+      RAJA::expt::Register<T2, REGISTER_POLICY> offsets)
+  {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+    RAJA::tensor_stats::num_vector_load_strided_n++;
+#endif
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i) {
+      getThis()->set(ptr[offsets.get(i)], i);
+    }
+    return *getThis();
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return true;
-      }
 
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &gather_n(
+      element_type const *ptr,
+      RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets,
+      camp::idx_t N)
+  {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+    RAJA::tensor_stats::num_vector_load_strided_n++;
+#endif
+    for (camp::idx_t i = 0; i < N; ++i) {
+      getThis()->set(ptr[offsets.get(i)], i);
+    }
+    return *getThis();
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      RegisterBase(){}
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      ~RegisterBase(){}
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and gather
+   * operations.
+   *
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &segmented_load(element_type const *ptr,
+                            camp::idx_t segbits,
+                            camp::idx_t stride_inner,
+                            camp::idx_t stride_outer)
+  {
+    getThis()->gather(ptr,
+                      self_type::s_segmented_offsets(segbits,
+                                                     stride_inner,
+                                                     stride_outer));
+    return *getThis();
+  }
 
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays where we load partial segments.
+   *
+   *
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &segmented_load_nm(element_type const *ptr,
+                               camp::idx_t segbits,
+                               camp::idx_t stride_inner,
+                               camp::idx_t stride_outer,
+                               camp::idx_t num_inner,
+                               camp::idx_t num_outer)
+  {
 
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size = 1 << segbits;
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      RegisterBase(RegisterBase const &){}
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg) {
+      for (camp::idx_t i = 0; i < seg_size; ++i) {
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      RegisterBase(self_type const &){
-      }
+        if (seg >= num_outer || i >= num_inner) {
+          getThis()->set(element_type(0), lane);
+        } else {
 
+          camp::idx_t offset = seg * stride_outer + i * stride_inner;
 
+          element_type value = ptr[offset];
 
-      /*!
-       * @brief Broadcast scalar value to first N register elements
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      self_type s_broadcast_n(element_type const &value, camp::idx_t N){
-        self_type x;
-        for(camp::idx_t i = 0;i < N;++ i){
-          x.set(value, i);
+          getThis()->set(value, lane);
         }
-        return x;
-      }
 
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.broadcast(getThis()->get(i));
-        return x;
+        lane++;
       }
+    }
 
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> offsets){
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const &scatter(
+      element_type *ptr,
+      RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
-          getThis()->set(ptr[offsets.get(i)], i);
-        }
-        return *getThis();
-      }
-
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i) {
+      ptr[offsets.get(i)] = getThis()->get(i);
+    }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets, camp::idx_t N){
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const &scatter_n(
+      element_type *ptr,
+      RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets,
+      camp::idx_t N) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-          for(camp::idx_t i = 0;i < N;++ i){
-            getThis()->set(ptr[offsets.get(i)], i);
-          }
-          return *getThis();
-      }
-
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load(element_type const *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer){
-        getThis()->gather(ptr, self_type::s_segmented_offsets(segbits, stride_inner, stride_outer));
-        return *getThis();
-      }
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load_nm(element_type const *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer)
-      {
+    for (camp::idx_t i = 0; i < N; ++i) {
+      ptr[offsets.get(i)] = getThis()->get(i);
+    }
+    return *getThis();
+  }
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * into larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and scatter
+   * operations.
+   *
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &segmented_store(element_type *ptr,
+                                   camp::idx_t segbits,
+                                   camp::idx_t stride_inner,
+                                   camp::idx_t stride_outer) const
+  {
+    getThis()->scatter(ptr,
+                       self_type::s_segmented_offsets(segbits,
+                                                      stride_inner,
+                                                      stride_outer));
+    return *getThis();
+  }
 
-            if(seg >= num_outer || i >= num_inner){
-              getThis()->set(element_type(0), lane);
-            }
-            else{
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * into larger arrays where we store partial segments.
+   *
+   *
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &segmented_store_nm(element_type *ptr,
+                                      camp::idx_t segbits,
+                                      camp::idx_t stride_inner,
+                                      camp::idx_t stride_outer,
+                                      camp::idx_t num_inner,
+                                      camp::idx_t num_outer) const
+  {
 
-              camp::idx_t offset = seg*stride_outer + i*stride_inner;
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size = 1 << segbits;
 
-              element_type value = ptr[offset];
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg) {
+      for (camp::idx_t i = 0; i < seg_size; ++i) {
 
-              getThis()->set(value, lane);
+        if (!(seg >= num_outer || i >= num_inner)) {
 
-            }
+          camp::idx_t offset = seg * stride_outer + i * stride_inner;
 
-            lane ++;
-          }
+          ptr[offset] = getThis()->get(lane);
         }
 
-        return *getThis();
+        lane++;
       }
+    }
 
+    return *getThis();
+  }
 
+  /*!
+   * @brief Set entire register to a single scalar value
+   * @param value Value to set all register elements to
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator=(element_type value)
+  {
+    getThis()->broadcast(value);
+    return *getThis();
+  }
 
+  /*!
+   * @brief Set entire register to a single scalar value
+   * @param value Scalar register whose element 0 is broadcast to all elements
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &operator=(
+      RAJA::expt::Register<T2, RAJA::expt::scalar_register> const &value)
+  {
+    getThis()->broadcast(value.get(0));
+    return *getThis();
+  }
 
+  /*!
+   * @brief Assign one register to another
+   * @param x register to copy
+   * @return Value of (*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator=(self_type const &x)
+  {
+    getThis()->copy(x);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets) const {
-#ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
-#endif
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
-          ptr[offsets.get(i)] = getThis()->get(i);
-        }
-        return *getThis();
-      }
 
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets, camp::idx_t N) const {
-#ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
-#endif
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[offsets.get(i)] = getThis()->get(i);
-        }
-        return *getThis();
-      }
+  /*!
+   * @brief Add two registers
+   * @param x register to add
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator+(self_type const &x) const { return getThis()->add(x); }
 
 
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store(element_type *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer) const {
-        getThis()->scatter(ptr, self_type::s_segmented_offsets(segbits, stride_inner, stride_outer));
-        return *getThis();
-      }
+  /*!
+   * @brief Add a register to this register
+   * @param x register to add
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator+=(self_type const &x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store_nm(element_type *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
+  /*!
+   * @brief Add scalar to this register
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   *
+   * This broadcasts the scalar to all lanes, then adds to this register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator+(element_type const &x) const { return getThis()->add(x); }
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
+  /*!
+   * @brief Add a scalar to this register
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   *
+   * This broadcasts the scalar to all lanes, then adds to this register
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator+=(element_type x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
 
-            if(!(seg >= num_outer || i >= num_inner)){
+  /*!
+   * @brief Negate the value of this register
+   * @return Value of -(*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-() const { return self_type(0).subtract(*getThis()); }
 
-              camp::idx_t offset = seg*stride_outer + i*stride_inner;
+  /*!
+   * @brief Subtract two registers
+   * @param x register to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-(self_type const &x) const
+  {
+    return getThis()->subtract(x);
+  }
 
-              ptr[offset] = getThis()->get(lane);
+  /*!
+   * @brief Subtract a register from this register
+   * @param x register to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator-=(self_type const &x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
 
-            }
+  /*!
+   * @brief Subtract scalar from this register
+   * @param x scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-(element_type const &x) const
+  {
+    return getThis()->subtract(x);
+  }
 
-            lane ++;
-          }
-        }
+  /*!
+   * @brief Subtract a scalar from this register
+   * @param x scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator-=(element_type const &x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
 
-        return *getThis();
-      }
+  /*!
+   * @brief Multiply this register by rhs, element wise
+   * @param rhs value to multiply with this register
+   * @return Value of (*this)*rhs
+   */
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator*(RHS const &rhs) const
+  {
+    return getThis()->multiply(rhs);
+  }
 
-      /*!
-       * @brief Set entire register to a single scalar value
-       * @param value Value to set all register elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(element_type value)
-      {
-        getThis()->broadcast(value);
-        return *getThis();
-      }
+  /*!
+   * @brief Multiply this register by rhs, in place
+   * @param rhs value to multiply with this register
+   * @return Value of (*this)*rhs
+   */
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &operator*=(RHS const &rhs)
+  {
+    *getThis() = getThis()->multiply(rhs);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Set entire register to a single scalar value
-       * @param value Value to set all register elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(RAJA::expt::Register<T2, RAJA::expt::scalar_register> const &value)
-      {
-        getThis()->broadcast(value.get(0));
-        return *getThis();
-      }
+  /*!
+   * @brief Divide two registers, element wise
+   * @param x register to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type operator/(self_type const &x) const { return getThis()->divide(x); }
 
-      /*!
-       * @brief Assign one register to another
-       * @param x register to copy
-       * @return Value of (*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &x)
-      {
-        getThis()->copy(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Divide this register by another register
+   * @param x register to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator/=(self_type const &x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
 
 
+  /*!
+   * @brief Divide by a scalar, element wise
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type operator/(element_type const &x) const
+  {
+    return getThis()->divide(x);
+  }
 
+  /*!
+   * @brief Divide this register by a scalar
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator/=(element_type const &x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
 
 
-      /*!
-       * @brief Add two registers
-       * @param x register to add
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(self_type const &x) const
-      {
-        return getThis()->add(x);
-      }
+  /*!
+   * @brief Divide n elements of this register by another register
+   * @param b register to divide by
+   * @param n Number of elements to divide
+   * @return Copy of (*this) with the first n lanes divided by the lanes of b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t n) const
+  {
+    self_type q(*getThis());
+    for (camp::idx_t i = 0; i < n; ++i) {
+      q.set(getThis()->get(i) / b.get(i), i);
+    }
+    return q;
+  }
 
+  /*!
+   * @brief Divide n elements of this register by a scalar
+   * @param b Scalar to divide by
+   * @param n Number of elements to divide
+   * @return Copy of (*this) with the first n lanes divided by b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(element_type const &b, camp::idx_t n) const
+  {
+    self_type q(*getThis());
+    for (camp::idx_t i = 0; i < n; ++i) {
+      q.set(getThis()->get(i) / b, i);
+    }
+    return q;
+  }
 
-      /*!
-       * @brief Add a register to this register
-       * @param x register to add
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(self_type const &x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Dot product of two registers
+   * @param x Other register to dot with this register
+   * @return Value of (*this) dot x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type dot(self_type const &x) const
+  {
+    return getThis()->multiply(x).sum();
+  }
 
-      /*!
-       * @brief Add scalar to this register
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       *
-       * This broadcasts the scalar to all lanes, then adds to this register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(element_type const &x) const
-      {
-        return getThis()->add(x);
-      }
+  /*!
+   * @brief Fused multiply add: fma(b, c) = (*this)*b+c
+   *
+   * Derived types can override this to implement intrinsic FMA's
+   *
+   * @param b Second product operand
+   * @param c Sum operand
+   * @return Value of (*this)*b+c
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const &b, self_type const &c) const
+  {
+    return (self_type(*getThis()) * self_type(b)) + self_type(c);
+  }
 
+  /*!
+   * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
+   *
+   * Derived types can override this to implement intrinsic FMS's
+   *
+   * @param b Second product operand
+   * @param c Subtraction operand
+   * @return Value of (*this)*b-c
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const &b, self_type const &c) const
+  {
+    return getThis()->multiply_add(b, -c);
+  }
 
-      /*!
-       * @brief Add a scalar to this register
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       *
-       * This broadcasts the scalar to all lanes, then adds to this register
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(element_type x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
+  /*!
+   * Multiply this tensor by a scalar value
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type scale(element_type c) const
+  {
+    return getThis()->multiply(self_type(c));
+  }
 
-      /*!
-       * @brief Negate the value of this register
-       * @return Value of -(*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-() const
-      {
-        return self_type(0).subtract(*getThis());
-      }
+  /*!
+   * Minimum value across first N lanes of register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type min_n(camp::idx_t N) const { return getThis()->min(N); }
 
-      /*!
-       * @brief Subtract two register registers
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(self_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
+  /*!
+   * Maximum value across first N lanes of register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type max_n(camp::idx_t N) const { return getThis()->max(N); }
 
-      /*!
-       * @brief Subtract a register from this register
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(self_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
+  /*!
+   * Provides vector-level building block for matrix transpose operations.
+   *
+   * This is a non-optimized reference version which will be used if
+   * no architecture specialized version is supplied
+   *
+   * This is a permute-and-shuffle left operation
+   *
+   *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
+   *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
+   *
+   *  lvl=0    Z=   x0  y0  x2  y2  x4  y4  x6  y6...
+   *  lvl=1    Z=   x0  x1  y0  y1  x4  x5  y4  y5...
+   *  lvl=2    Z=   x0  x1  x2  x3  y0  y1  y2  y3...
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type transpose_shuffle_left(camp::idx_t lvl, self_type const &y) const
+  {
+    auto const &x = *getThis();
 
-      /*!
-       * @brief Subtract scalar from this register
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(element_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
+    self_type z;
 
-      /*!
-       * @brief Subtract a scalar from this register
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(element_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i) {
 
-      /*!
-       * @brief Multiply two register registers, element wise
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator*(RHS const &rhs) const
-      {
-        return getThis()->multiply(rhs);
-      }
+      // extract value x or y
+      camp::idx_t xy_select = (i >> lvl) & 0x1;
 
-      /*!
-       * @brief Multiply a register with this register
-       * @param x register to multiple with this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator*=(RHS const &rhs)
-      {
-        *getThis() = getThis()->multiply(rhs);
-        return *getThis();
-      }
 
-      /*!
-       * @brief Divide two register registers, element wise
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(self_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
+      z.set(xy_select == 0 ? x.get(i) : y.get(i - (1 << lvl)), i);
+    }
 
-      /*!
-       * @brief Divide this register by another register
-       * @param x register to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(self_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
+    return z;
+  }
 
 
-      /*!
-       * @brief Divide by a scalar, element wise
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(element_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
+  /*!
+   * Provides vector-level building block for matrix transpose operations.
+   *
+   * This is a non-optimized reference version which will be used if
+   * no architecture specialized version is supplied
+   *
+   * This is a permute-and-shuffle right operation
+   *
+   *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
+   *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
+   *
+   *  lvl=0    Z=   x1  y1  x3  y3  x5  y5  x7  y7...
+   *  lvl=1    Z=   x2  x3  y2  y3  x6  x7  y6  y7...
+   *  lvl=2    Z=   x4  x5  x6  x7  y4  y5  y6  y7...
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type transpose_shuffle_right(int lvl, self_type const &y) const
+  {
+    auto const &x = *getThis();
 
-      /*!
-       * @brief Divide this register by another register
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(element_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
+    self_type z;
 
+    camp::idx_t i0 = 1 << lvl;
 
-      /*!
-       * @brief Divide n elements of this register by another register
-       * @param x register to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t n) const {
-        self_type q(*getThis());
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(getThis()->get(i) / b.get(i), i);
-        }
-        return q;
-      }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i) {
 
-      /*!
-       * @brief Divide n elements of this register by a scalar
-       * @param x Scalar to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(element_type const &b, camp::idx_t n) const {
-        self_type q(*getThis());
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(getThis()->get(i) / b, i);
-        }
-        return q;
-      }
+      // extract value x or y
+      camp::idx_t xy_select = (i >> lvl) & 0x1;
 
-      /*!
-       * @brief Dot product of two registers
-       * @param x Other register to dot with this register
-       * @return Value of (*this) dot x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type dot(self_type const &x) const
-      {
-        return getThis()->multiply(x).sum();
-      }
+      z.set(xy_select == 0 ? x.get(i0 + i) : y.get(i0 + i - (1 << lvl)), i);
+    }
 
-      /*!
-       * @brief Fused multiply add: fma(b, c) = (*this)*b+c
-       *
-       * Derived types can override this to implement intrinsic FMA's
-       *
-       * @param b Second product operand
-       * @param c Sum operand
-       * @return Value of (*this)*b+c
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return (self_type(*getThis()) * self_type(b)) + self_type(c);
-      }
+    return z;
+  }
 
-      /*!
-       * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
-       *
-       * Derived types can override this to implement intrinsic FMS's
-       *
-       * @param b Second product operand
-       * @param c Subtraction operand
-       * @return Value of (*this)*b-c
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return getThis()->multiply_add(b, -c);
-      }
 
-      /*!
-       * Multiply this tensor by a scalar value
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type scale(element_type c) const
-      {
-        return getThis()->multiply(self_type(c));
-      }
+  /*!
+   * Provides gather/scatter indices for segmented loads and stores
+   *
+   * The number of segment bits (segbits) is specified, as well as the
+   * stride between elements in a segment (stride_inner),
+   * and the stride between segments (stride_outer)
+   */
+  RAJA_INLINE
+  static int_vector_type s_segmented_offsets(camp::idx_t segbits,
+                                             camp::idx_t stride_inner,
+                                             camp::idx_t stride_outer)
+  {
+    int_vector_type result;
 
-      /*!
-       * Minimum value across first N lanes of register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type min_n(camp::idx_t N) const
-      {
-        return getThis()->min(N);
-      }
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size = 1 << segbits;
 
-      /*!
-       * Maximum value across first N lanes of register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type max_n(camp::idx_t N) const
-      {
-        return getThis()->max(N);
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg) {
+      for (camp::idx_t i = 0; i < seg_size; ++i) {
+        result.set(seg * stride_outer + i * stride_inner, lane);
+        lane++;
       }
+    }
 
-      /*!
-       * Provides vector-level building block for matrix transpose operations.
-       *
-       * This is a non-optimized reference version which will be used if
-       * no architecture specialized version is supplied
-       *
-       * This is a permute-and-shuffle left operation
-       *
-       *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
-       *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
-       *
-       *  lvl=0    Z=   x0  y0  x2  y2  x4  y4  x6  y6...
-       *  lvl=1    Z=   x0  x1  y0  y1  x4  x5  y4  y5...
-       *  lvl=2    Z=   x0  x1  x2  x3  y0  y1  y2  y3...
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type transpose_shuffle_left(camp::idx_t lvl, self_type const &y) const
-      {
-        auto const &x = *getThis();
-
-        self_type z;
-
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
-
-          // extract value x or y
-          camp::idx_t xy_select = (i >> lvl) & 0x1;
-
-
-          z.set(xy_select == 0 ? x.get(i) : y.get(i - (1<<lvl)), i);
-        }
+    return result;
+  }
 
-        return z;
-      }
 
+  /*!
+   * Sum elements within each segment, with segment size defined by segbits.
+   * Stores each segment's sum consecutively, but shifted to the
+   * corresponding output_segment slot.
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums neighboring pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  self_type segmented_sum_inner(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+    self_type result(0);
 
-      /*!
-       * Provides vector-level building block for matrix transpose operations.
-       *
-       * This is a non-optimized reference version which will be used if
-       * no architecture specialized version is supplied
-       *
-       * This is a permute-and-shuffle right operation
-       *
-       *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
-       *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
-       *
-       *  lvl=0    Z=   x1  y1  x3  y3  x5  y5  x7  y7...
-       *  lvl=1    Z=   x2  x3  y2  y3  x6  x7  y6  y7...
-       *  lvl=2    Z=   x4  x5  x6  x7  y4  y5  y6  y7...
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type transpose_shuffle_right(int lvl, self_type const &y) const
-      {
-        auto const &x = *getThis();
-
-        self_type z;
-
-        camp::idx_t i0 = 1<<lvl;
-
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
-
-          // extract value x or y
-          camp::idx_t xy_select = (i >> lvl) & 0x1;
-
-          z.set(xy_select == 0 ? x.get(i0 + i) : y.get(i0 + i - (1<<lvl)), i);
-        }
+    // default implementation is dumb, just sum each value into
+    // appropriate segment lane
+    int output_offset = output_segment * self_type::s_num_elem >> segbits;
 
-        return z;
-      }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i) {
+      auto value =
+          getThis()->get(i) + result.get((i >> segbits) + output_offset);
+      result.set(value, (i >> segbits) + output_offset);
+    }
+
+    return result;
+  }
 
+  /*!
+   * Sum all segments as subvectors, with segment size defined by segbits
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 the segments are size 1, which means that this is just a
+   *      sum of all elements.  The output_segment determines where the
+   *      result is placed.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0, 0, 0, 0
+   *
+   *      output_segment=3:
+   *      Result= 0, 0, 0, x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0
+   *
+   *  segbits=1 the segments are 2-wide:
+   *
+   *      output_segment=0:
+   *      Result= x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0
+   *
+   *  and so on up to segbits=3, which is just the original vector:
+   *  segbits=3
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  self_type segmented_sum_outer(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+    self_type result(0);
 
+    // default implementation is dumb, just sum each value into
+    // appropriate segment lane
+    int output_offset = output_segment * (1 << segbits);
 
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i) {
+      camp::idx_t output_i = output_offset + (i & ((1 << segbits) - 1));
+      auto value = getThis()->get(i) + result.get(output_i);
+      result.set(value, output_i);
+    }
 
-      /*!
-       * Provides gather/scatter indices for segmented loads and stores
-       *
-       * THe number of segment bits (segbits) is specified, as well as the
-       * stride between elements in a segment (stride_inner),
-       * and the stride between segments (stride_outer)
-       */
-      RAJA_INLINE
-      static
-      int_vector_type s_segmented_offsets(camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer)
-      {
-        int_vector_type result;
+    return result;
+  }
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
-            result.set(seg*stride_outer + i*stride_inner, lane);
-            lane ++;
-          }
-        }
+  RAJA_INLINE
+  self_type segmented_divide_nm(self_type den,
+                                camp::idx_t segbits,
+                                camp::idx_t num_inner,
+                                camp::idx_t num_outer) const
+  {
+    self_type result;
 
-        return result;
-      }
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size = 1 << segbits;
 
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg) {
+      for (camp::idx_t i = 0; i < seg_size; ++i) {
 
-      /*!
-       * Sum elements within each segment, with segment size defined by segbits.
-       * Stores each segments sum consecutively, but shifed to the
-       * corresponding output_segment slot.
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums neighboring pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      self_type segmented_sum_inner(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-        self_type result(0);
-
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        int output_offset = output_segment * self_type::s_num_elem>>segbits;
-
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-          auto value = getThis()->get(i) + result.get((i >> segbits)+output_offset);
-          result.set(value, (i >> segbits)+output_offset);
-        }
+        if (seg >= num_outer || i >= num_inner) {
+          result.set(element_type(0), lane);
+        } else {
 
-        return result;
-      }
+          element_type div = getThis()->get(lane) / den.get(lane);
 
-      /*!
-       * Sum all segments as subvectors, with segment size defined by segbits
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 the segments are size 1, which means that this is just a
-       *      sum of all elements.  The output_segment determines where the
-       *      result is placed.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0, 0, 0, 0, 0
-       *
-       *      output_segment=3:
-       *      Result= 0, 0, x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0, 0, 0
-       *
-       *  segbits=1 the segments are 2-wide:
-       *
-       *      output_segment=0:
-       *      Result= x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0
-       *
-       *  and so on up to segbits=3, which is just the original vector:
-       *  segbits=3
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      self_type segmented_sum_outer(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-        self_type result(0);
-
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        int output_offset = output_segment * (1<<segbits);
-
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-          camp::idx_t output_i = output_offset + (i&((1<<segbits)-1));
-          auto value = getThis()->get(i) + result.get(output_i);
-          result.set(value, output_i);
+          result.set(div, lane);
         }
 
-        return result;
+        lane++;
       }
+    }
 
+    return result;
+  }
 
 
-      RAJA_INLINE
-      self_type segmented_divide_nm(self_type den, camp::idx_t segbits, camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        self_type result;
-
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
-
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
-
-            if(seg >= num_outer || i >= num_inner){
-              result.set(element_type(0), lane);
-            }
-            else{
-
-              element_type div = getThis()->get(lane) / den.get(lane);
-
-              result.set(div, lane);
-
-            }
-
-            lane ++;
-          }
-        }
-
-        return result;
-      }
+  /*!
+   * Segmented dot product performs dot products
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *                      Y = y0, y1, y2, y3, y4, y5, y6, y7
+   *
+   *
+   *  segbits=0 is equivalent to a vector multiply,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0*y0, x1*y1, x2*y2, x3*y3, x4*y4, x5*y5, x6*y6, x7*y7
+   *
+   *  segbits=1 sums neighboring pairs of products.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7
+   *
+   *  and so on up to segbits=3, which is a full dot-product of x and y, and the
+   *      output_segment denotes the vector position of the result
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type segmented_dot(camp::idx_t segbits,
+                          camp::idx_t output_segment,
+                          self_type const &x) const
+  {
+    return getThis()->multiply(x).segmented_sum_inner(segbits, output_segment);
+  }
 
+  /*!
+   * Segmented broadcast copies a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x0, x1, x0, x1, x0, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x3, x2, x3, x2, x3, x2, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x7, x6, x7, x6, x7, x6, x7
+   *
+   *  and so on up to segbits=2, the input segments are 4 wide:
+   *
+   *      input segments allowed are 0 or 1
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x2, x3, x0, x1, x2, x3
+   *
+   *      input_segment=1:
+   *      Result= x4, x5, x6, x7, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  self_type segmented_broadcast_inner(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
 
+    camp::idx_t mask = (1 << segbits) - 1;
+    camp::idx_t offset = input_segment << segbits;
 
-      /*!
-       * Segmented dot product performs dot products
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *                      Y = y0, y1, y2, y3, y4, y5, y6, y7
-       *
-       *
-       *  segbits=0 is equivalent to a vector multiply,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0*y0, x1*y1, x2*y2, x3*y3, x4*y4, x5*y5, x6*y6, x7*y7
-       *
-       *  segbits=1 sums neighboring pairs of products.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7
-       *
-       *  and so on up to segbits=3, which is a full dot-product of x and y, and the
-       *      output_segment denotes the vector position of the result
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type segmented_dot(camp::idx_t segbits, camp::idx_t output_segment, self_type const &x) const
-      {
-        return getThis()->multiply(x).segmented_sum_inner(segbits, output_segment);
-      }
+    // default implementation is simple: copy each value of the selected
+    // input segment into every output segment lane
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i) {
 
-      /*!
-       * Segmented broadcast copies a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      input_segment=0:
-       *      Result= x0, x1, x0, x1, x0, x1, x0, x1
-       *
-       *      input_segment=1:
-       *      Result= x2, x3, x2, x3, x2, x3, x2, x3
-       *
-       *      input_segment=3:
-       *      Result= x6, x7, x6, x7, x6, x7, x6, x7
-       *
-       *  and so on up to segbits=2, the input segments are 4 wide:
-       *
-       *      input segments allowed are from 0 or 1
-       *
-       *      input_segment=0:
-       *      Result= x0, x1, x2, x3, x0, x1, x2, x3
-       *
-       *      input_segment=1:
-       *      Result= x4, x5, x6, x7, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      self_type segmented_broadcast_inner(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t mask = (1<<segbits)-1;
-        camp::idx_t offset = input_segment << segbits;
-
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-
-          auto off = (i&mask) + offset;
-
-          result.set(getThis()->get(off), i);
-        }
+      auto off = (i & mask) + offset;
 
-        return result;
-      }
+      result.set(getThis()->get(off), i);
+    }
 
+    return result;
+  }
 
-      /*!
-       * Segmented broadcast spreads a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x0, x0, x0, x1, x1, x1, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x2, x2, x2, x3, x3, x3, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x6, x6, x6, x7, x7, x7, x7
-       */
-      RAJA_INLINE
-      self_type segmented_broadcast_outer(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
-
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-
-          auto off = (i>>segbits) + offset;
-
-          result.set(getThis()->get(off), i);
-        }
 
-        return result;
-      }
+  /*!
+   * Segmented broadcast spreads a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
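+   * NOTE(review): the default loop reads lane (i >> segbits) + offset, which
+   *       does not appear to reproduce the examples below - please confirm.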
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x0, x0, x0, x1, x1, x1, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x2, x2, x2, x3, x3, x3, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x6, x6, x6, x7, x7, x7, x7
+   */
+  RAJA_INLINE
+  self_type segmented_broadcast_outer(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
 
+    camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
 
+    // default implementation is simple: copy the value selected for each
+    // output lane out of the input vector
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i) {
 
+      auto off = (i >> segbits) + offset;
 
+      result.set(getThis()->get(off), i);
+    }
 
-      /*!
-       * @brief Converts to vector to a string
-       *
-       *
-       */
-      RAJA_INLINE
-      std::string to_string() const {
-        std::string s = "Register(" + std::to_string(self_type::s_num_elem) + ")[ ";
+    return result;
+  }
 
-        //
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-          s += std::to_string(getThis()->get(i)) + " ";
-        }
 
-        s += " ]\n";
+  /*!
+   * @brief Converts the vector to a string
+   *
+   *
+   */
+  RAJA_INLINE
+  std::string to_string() const
+  {
+    std::string s = "Register(" + std::to_string(self_type::s_num_elem) + ")[ ";
 
-        return s;
-      }
+    //
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i) {
+      s += std::to_string(getThis()->get(i)) + " ";
+    }
 
-  };
+    s += " ]\n";
 
+    return s;
+  }
+};
 
-} // namespace expt
-} // namespace internal
-} // namespace RAJA
 
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp b/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp
index bb53993fed..4a27d9278b 100644
--- a/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp
@@ -19,355 +19,279 @@
 #define RAJA_pattern_tensor_TensorIndexTraits_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/TensorIndex.hpp"
+#include "RAJA/util/macros.hpp"
 
 namespace RAJA
 {
 
 namespace internal
 {
-    /* Partial specialization for the strip_index_type_t helper in
-       IndexValue.hpp
-    */
-    template<typename IDX, typename VECTOR_TYPE, camp::idx_t DIM>
-    struct StripIndexTypeT<RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>>
-    {
-        using type = typename RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>::value_type;
-    };
+/* Partial specialization for the strip_index_type_t helper in
+   IndexValue.hpp
+*/
+template <typename IDX, typename VECTOR_TYPE, camp::idx_t DIM>
+struct StripIndexTypeT<RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>> {
+  using type =
+      typename RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>::value_type;
+};
 
 
 namespace expt
 {
 
 
+// Helper that strips the Vector type from an argument
+template <typename ARG>
+struct TensorIndexTraits {
+  using arg_type = ARG;
+  using value_type = strip_index_type_t<ARG>;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool isTensorIndex() { return false; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const &strip(arg_type const &arg) { return arg; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const strip_by_value(arg_type const arg)
+  {
+    return arg;
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type size(arg_type const &) { return 1; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type begin(arg_type const &) { return 0; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type dim() { return 0; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type num_elem() { return 1; }
+};
+
+template <typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
+struct TensorIndexTraits<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>> {
+  using index_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using arg_type = IDX;
+  using value_type = strip_index_type_t<IDX>;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool isTensorIndex() { return true; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const &strip(index_type const &arg) { return *arg; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const strip_by_value(index_type const arg)
+  {
+    return (arg_type)arg;
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type size(index_type const &arg) { return arg.size(); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type begin(index_type const &arg)
+  {
+    return arg.begin();
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type dim() { return DIM; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type num_elem()
+  {
+    return TENSOR_TYPE::s_dim_elem(DIM);
+  }
+};
+
+
+template <typename IDX,
+          typename TENSOR_TYPE,
+          camp::idx_t DIM,
+          IDX INDEX_VALUE,
+          strip_index_type_t<IDX> LENGTH_VALUE>
+struct TensorIndexTraits<RAJA::expt::StaticTensorIndex<
+    RAJA::expt::StaticTensorIndexInner<IDX,
+                                       TENSOR_TYPE,
+                                       DIM,
+                                       INDEX_VALUE,
+                                       LENGTH_VALUE>>> {
+  using base_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using index_type = RAJA::expt::StaticTensorIndex<
+      RAJA::expt::StaticTensorIndexInner<IDX,
+                                         TENSOR_TYPE,
+                                         DIM,
+                                         INDEX_VALUE,
+                                         LENGTH_VALUE>>;
+  using arg_type = IDX;
+  using value_type = strip_index_type_t<IDX>;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool isTensorIndex() { return true; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const strip_by_value(index_type const)
+  {
+    return INDEX_VALUE;
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type size(index_type const &) { return LENGTH_VALUE; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type begin(index_type const &) { return INDEX_VALUE; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type dim() { return DIM; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type num_elem()
+  {
+    return TENSOR_TYPE::s_dim_elem(DIM);
+  }
+};
+
+/*
+ * Returns whether ARG is a tensor index type.
+ *
+ * For scalars (the primary TensorIndexTraits template), returns false.
+ *
+ * For TensorIndex and StaticTensorIndex types, returns true.
+ */
+template <typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr bool isTensorIndex()
+{
+  return TensorIndexTraits<ARG>::isTensorIndex();
+}
+
+template <typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto stripTensorIndex(ARG const &arg) ->
+    typename TensorIndexTraits<ARG>::arg_type const &
+{
+  return TensorIndexTraits<ARG>::strip(arg);
+}
 
 
+template <typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto stripTensorIndexByValue(
+    ARG const arg) -> typename TensorIndexTraits<ARG>::arg_type const
+{
+  return TensorIndexTraits<ARG>::strip_by_value(arg);
+}
 
+/*
+ * Returns tensor dimension size of argument.
+ *
+ * Uses the argument's own size when it is non-negative, else dim_size.
+ */
+template <typename ARG, typename IDX>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr IDX getTensorSize(ARG const &arg,
+                                                         IDX dim_size)
+{
+  return TensorIndexTraits<ARG>::size(arg) >= 0
+             ? IDX(TensorIndexTraits<ARG>::size(arg))
+             : dim_size;
+}
 
-    // Helper that strips the Vector type from an argument
-    template<typename ARG>
-    struct TensorIndexTraits {
-        using arg_type = ARG;
-        using value_type = strip_index_type_t<ARG>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        bool isTensorIndex(){
-          return false;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const &strip(arg_type const &arg){
-          return arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const strip_by_value(arg_type const arg){
-          return arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type size(arg_type const &){
-          return 1;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type begin(arg_type const &){
-          return 0;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type dim(){
-          return 0;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type num_elem(){
-          return 1;
-        }
-    };
-
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
-    struct TensorIndexTraits<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>> {
-        using index_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
-        using arg_type = IDX;
-        using value_type = strip_index_type_t<IDX>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        bool isTensorIndex(){
-          return true;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const &strip(index_type const &arg){
-          return *arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const strip_by_value(index_type const arg){
-          return (arg_type)arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type size(index_type const &arg){
-          return arg.size();
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type begin(index_type const &arg){
-          return arg.begin();
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type dim(){
-          return DIM;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type num_elem(){
-          return TENSOR_TYPE::s_dim_elem(DIM);
-        }
-    };
-
-
-
-
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, IDX INDEX_VALUE, strip_index_type_t<IDX> LENGTH_VALUE>
-    struct TensorIndexTraits<RAJA::expt::StaticTensorIndex<
-        RAJA::expt::StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, INDEX_VALUE, LENGTH_VALUE>
-    >> {
-        using base_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
-        using index_type = RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, INDEX_VALUE, LENGTH_VALUE>>;
-        using arg_type = IDX;
-        using value_type = strip_index_type_t<IDX>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        bool isTensorIndex(){
-          return true;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const strip_by_value(index_type const){
-          return INDEX_VALUE;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type size(index_type const &){
-          return LENGTH_VALUE;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type begin(index_type const &){
-          return INDEX_VALUE;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type dim(){
-          return DIM;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type num_elem(){
-          return TENSOR_TYPE::s_dim_elem(DIM);
-        }
-    };
-
-    /*
-     * Returns vector size of argument.
-     *
-     * For scalars, always returns 1.
-     *
-     * For VectorIndex types, returns the number of vector lanes.
-     */
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    bool isTensorIndex()
-    {
-      return TensorIndexTraits<ARG>::isTensorIndex();
-    }
-
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto stripTensorIndex(ARG const &arg) ->
-    typename TensorIndexTraits<ARG>::arg_type const &
-    {
-      return TensorIndexTraits<ARG>::strip(arg);
-    }
-
-
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto stripTensorIndexByValue(ARG const arg) ->
-    typename TensorIndexTraits<ARG>::arg_type const
-    {
-      return TensorIndexTraits<ARG>::strip_by_value(arg);
-    }
-
-    /*
-     * Returns tensor dimension size of argument.
-     *
-     * For VectorIndex types, returns the number of vector lanes.
-     */
-    template<typename ARG, typename IDX>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    IDX getTensorSize(ARG const &arg, IDX dim_size)
-    {
-      return TensorIndexTraits<ARG>::size(arg) >= 0 ?
-          IDX(TensorIndexTraits<ARG>::size(arg)) :
-          dim_size;
-    }
-
-    /*
-     * Returns tensor dimenson beginning index of an argument.
-     *
-     */
-    template<typename ARG, typename IDX>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    IDX getTensorBegin(ARG const &arg, IDX dim_minval)
-    {
-      return TensorIndexTraits<ARG>::begin(arg) >= 0 ?
-          IDX(TensorIndexTraits<ARG>::begin(arg)) :
-          dim_minval;
-    }
-
-    /*
-     * Returns vector dim of argument.
-     *
-     * For scalars, always returns 0.
-     *
-     * For VectorIndex types, returns the DIM argument.
-     * For vector_exec, this is always 0
-     *
-     * For matrices, DIM means:
-     *   0 : Row
-     *   1 : Column
-     */
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto getTensorDim() ->
-      decltype(TensorIndexTraits<ARG>::dim())
-    {
-      return TensorIndexTraits<ARG>::dim();
-    }
-
-} // namespace expt
-
-
-    /*
-     * Lambda<N, Seg<X>>  overload that matches VectorIndex types, and properly
-     * includes the vector length with them
-     */
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
-    struct LambdaSegExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
-    {
-
-      template<typename Data>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      static RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM> extract(Data &&data)
-      {
-        return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
-            camp::get<id>(data.segment_tuple).begin()[camp::get<id>(data.offset_tuple)],
-            camp::get<id>(data.vector_sizes));
-      }
-
-    };
-
-    /*
-     * Lambda<N, Seg<X>>  overload that matches VectorIndex types, and properly
-     * includes the vector length with them
-     */
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
-    struct LambdaOffsetExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
-    {
-
-      template<typename Data>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      static RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM> extract(Data &&data)
-      {
-        return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
-            IDX(camp::get<id>(data.offset_tuple)), // convert offset type to IDX
-            camp::get<id>(data.vector_sizes));
-      }
-
-    };
-
-} // namespace internal
+/*
+ * Returns tensor dimension beginning index of an argument.
+ * Uses the argument's own begin when it is non-negative, else dim_minval.
+ */
+template <typename ARG, typename IDX>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr IDX getTensorBegin(ARG const &arg,
+                                                          IDX dim_minval)
+{
+  return TensorIndexTraits<ARG>::begin(arg) >= 0
+             ? IDX(TensorIndexTraits<ARG>::begin(arg))
+             : dim_minval;
+}
+
+/*
+ * Returns vector dim of argument.
+ *
+ * For scalars, always returns 0.
+ *
+ * For VectorIndex types, returns the DIM argument.
+ * For vector_exec, this is always 0
+ *
+ * For matrices, DIM means:
+ *   0 : Row
+ *   1 : Column
+ */
+template <typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto getTensorDim()
+    -> decltype(TensorIndexTraits<ARG>::dim())
+{
+  return TensorIndexTraits<ARG>::dim();
+}
+
+}  // namespace expt
+
+
+/*
+ * Lambda<N, Seg<X>>  overload that matches VectorIndex types, and properly
+ * includes the vector length with them
+ */
+template <typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
+struct LambdaSegExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id> {
+
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static RAJA::expt::
+      TensorIndex<IDX, TENSOR_TYPE, DIM>
+      extract(Data &&data)
+  {
+    return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
+        camp::get<id>(data.segment_tuple)
+            .begin()[camp::get<id>(data.offset_tuple)],
+        camp::get<id>(data.vector_sizes));
+  }
+};
+
+/*
+ * LambdaOffsetExtractor specialization that matches TensorIndex types: builds
+ * the index from the offset (converted to IDX) plus the vector length
+ */
+template <typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
+struct LambdaOffsetExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>,
+                             id> {
+
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static RAJA::expt::
+      TensorIndex<IDX, TENSOR_TYPE, DIM>
+      extract(Data &&data)
+  {
+    return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
+        IDX(camp::get<id>(data.offset_tuple)),  // convert offset type to IDX
+        camp::get<id>(data.vector_sizes));
+  }
+};
+
+}  // namespace internal
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/internal/TensorRef.hpp b/include/RAJA/pattern/tensor/internal/TensorRef.hpp
index 60e31f24b9..756d0ca479 100644
--- a/include/RAJA/pattern/tensor/internal/TensorRef.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorRef.hpp
@@ -19,7 +19,6 @@
 #define RAJA_pattern_tensor_tensorref_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/util/macros.hpp"
 
 
@@ -30,656 +29,735 @@ namespace internal
 namespace expt
 {
 
-    template<typename INT_SEQ>
-    struct StaticIndexArray;
-
-    template<typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, typename ARRAY>
-    struct PrependStaticIndexArray;
-
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY >
-    struct AddStaticIndexArray;
-
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY >
-    struct SetStaticIndexArray;
-
-
-    template<typename INDEX_TYPE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>> {
-        
-        using seq_type = camp::int_seq<INDEX_TYPE,HEAD,TAIL...>;
-        using Self = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using Tail = StaticIndexArray<camp::int_seq<INDEX_TYPE,TAIL...>>;
-
-        Tail tail;
-
-        RAJA_INLINE
-        StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>() = default;
-       
-	 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        static constexpr INDEX_TYPE value_at(size_t index) {
-            if(index == 0){
-                return HEAD;
-            } else {
-                return Tail::value_at(index-1);
-            }
-        }
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        constexpr INDEX_TYPE operator[](size_t index) const {
-            if(index == 0){
-                return HEAD;
-            } else {
-                return tail[index-1];
-            }
-        }
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print_values() const {
-            printf("%ld ",(long)HEAD);
-            tail.print_values();
-        }
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-            printf("[");
-            print_values();
-            printf("]");
-        }
-
-
-    };
-
-    template<typename INDEX_TYPE>
-    struct StaticIndexArray<camp::int_seq<INDEX_TYPE>>
-    {
-
-        using seq_type = camp::int_seq<INDEX_TYPE>;
-
-        RAJA_INLINE
-        StaticIndexArray<camp::int_seq<INDEX_TYPE>>() = default;
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        static constexpr INDEX_TYPE value_at(size_t) {
-            return 0;
-        }
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        constexpr INDEX_TYPE operator[](size_t) const {
-            return 0;
-        }
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print_values() const {}
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-            print("[]");
-        }
-
-    };
-
-    template<typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, INDEX_TYPE... ORIG_INTS>
-    struct PrependStaticIndexArray<INDEX_TYPE, NEW_HEAD, StaticIndexArray<camp::int_seq<INDEX_TYPE,ORIG_INTS...>>>
-    {
-        using Type = StaticIndexArray<camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>>;
-        using Seq  = camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>;
-    };
-
-
-
-
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct AddStaticIndexArray<INDEX_TYPE, IDX, DELTA, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>> 
-    {
-        using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using AddTail = typename AddStaticIndexArray<INDEX_TYPE,IDX-1,DELTA,typename Orig::Tail>::Type;
-        using Type = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,AddTail>::Type;
-        using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,AddTail>::Seq;
-    };
-
-    template<typename INDEX_TYPE, INDEX_TYPE DELTA, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct AddStaticIndexArray<INDEX_TYPE, 0, DELTA, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>>
-    {
-
-        using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using Type = typename PrependStaticIndexArray<INDEX_TYPE,HEAD+DELTA,typename Orig::Tail>::Type;
-        using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,HEAD+DELTA,typename Orig::Tail>::Seq;
-    };
-
-
-
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE VALUE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct SetStaticIndexArray<INDEX_TYPE, IDX, VALUE, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>> 
-    {
-        using Orig    = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using SetTail = typename SetStaticIndexArray<INDEX_TYPE,IDX-1,VALUE,typename Orig::Tail>::Type;
-        using Type    = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,SetTail>::Type;
-        using Seq     = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,SetTail>::Seq;
-    };
-
-    template<typename INDEX_TYPE, INDEX_TYPE VALUE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct SetStaticIndexArray<INDEX_TYPE, 0, VALUE, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>>
-    {
-        using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using Type = typename PrependStaticIndexArray<INDEX_TYPE,VALUE,typename Orig::Tail>::Type;
-        using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,VALUE,typename Orig::Tail>::Seq;
-    };
-
-
-    enum TensorTileSize
-    {
-      TENSOR_PARTIAL,  // the tile is a full TensorRegister
-      TENSOR_FULL,     // the tile is a partial TensorRegister
-      TENSOR_MULTIPLE  // the tile is multiple TennsorRegisters
-    };
-
-    template<typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, camp::idx_t NUM_DIMS>
-    struct TensorTile
-    {
-        using self_type = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
-        using nonstatic_self_type = self_type;
-        using index_type = INDEX_TYPE;
-        index_type m_begin[NUM_DIMS];
-        index_type m_size[NUM_DIMS];
-
-        static constexpr camp::idx_t s_num_dims = NUM_DIMS;
-        static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
-
-
-        template<typename I, TensorTileSize S>
-        void copy(TensorTile<I, S, NUM_DIMS> const &c)
-        {
-          for(camp::idx_t i = 0;i < NUM_DIMS;++i){
-            m_begin[i] = c.m_begin[i];
-            m_size[i] = c.m_size[i];
-          }
-        }
-
-        /*!
-         * Subtract begin offsets of two tiles.
-         *
-         * The resulting tile has the sizes of the left operand, but has
-         * m_begin[i] = left.m_begin[i] - right.m_begin[i]
-         *
-         */
-        template<typename INDEX_TYPE2, TensorTileSize TENSOR_SIZE2>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type operator-(TensorTile<INDEX_TYPE2, TENSOR_SIZE2, NUM_DIMS> const &sub) const {
-          self_type result(*this);
-          for(camp::idx_t i = 0;i < s_num_dims; ++ i){
-            result.m_begin[i] -= sub.m_begin[i];
-          }
-          return result;
-        }
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("TensorTile: dims=%d, m_begin=[",  (int)NUM_DIMS);
-
-          for(camp::idx_t i = 0;i < NUM_DIMS;++ i){
-            printf("%ld ", (long)m_begin[i]);
-          }
-
-          printf("], m_size=[");
-
-          for(camp::idx_t i = 0;i < NUM_DIMS;++ i){
-            printf("%ld ", (long)m_size[i]);
-          }
-
-          printf("]\n");
-        }
-    };
-
-
-
-
-    template< typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE>
-    struct StaticTensorTile;
-
-    template< typename INDEX_TYPE,
-              TensorTileSize TENSOR_SIZE,
-              INDEX_TYPE... BeginInts,
-              INDEX_TYPE... SizeInts>
-    struct StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, BeginInts...>,
-              camp::int_seq<INDEX_TYPE, SizeInts...>>
-    {
-
-
-
-        using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
-        using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts... >;
-        using begin_type = StaticIndexArray<begin_seq>;
-        using size_type  = StaticIndexArray<size_seq >;
-        using self_type  = StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq,size_seq>;
-        using index_type = INDEX_TYPE;
-
-        using nonstatic_self_type = TensorTile<INDEX_TYPE,TENSOR_SIZE,sizeof...(BeginInts)>;
-
-        using Partial = StaticTensorTile< INDEX_TYPE, TENSOR_PARTIAL, begin_seq, size_seq>; 
-        using Full    = StaticTensorTile< INDEX_TYPE, TENSOR_FULL   , begin_seq, size_seq>; 
-
-        begin_type m_begin;
-        size_type  m_size;
-
-	static_assert(
-          sizeof...(BeginInts) == sizeof...(SizeInts),
-          "Mismatch between number of elements in Begin and Size series of StaticTensorTile"
-        );
-
-        static constexpr camp::idx_t s_num_dims = sizeof...(BeginInts);
-        static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
-
-        constexpr operator nonstatic_self_type() const {
-            return nonstatic_self_type { {BeginInts...}, {SizeInts...} };
-        }
-
-        constexpr nonstatic_self_type nonstatic() const {
-            return *this;
-        }
-        
-        template<TensorTileSize S>
-        constexpr void copy(StaticTensorTile<INDEX_TYPE, S, begin_seq, size_seq> const RAJA_UNUSED_ARG(&c)) const
-        {}
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("StaticTensorTile: dims=%d, m_begin=",  (int)s_num_dims);
-
-          m_begin.print();
-
-          printf(", m_size=");
-          
-          m_size.print();
-
-          printf("\n");
-        }
-    };
-
-        template< typename TILE, typename VALUE, size_t IDX>
-        struct SetStaticTensorTileBegin;
-
-        template< typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE, INDEX_TYPE VALUE, size_t IDX > 
-        struct SetStaticTensorTileBegin<
-              StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE >,
-              camp::integral_constant<INDEX_TYPE,VALUE>,
-              IDX
-        > {
-            using BeginType = StaticIndexArray<TBEGIN>;
-            using Type = StaticTensorTile<
-                INDEX_TYPE,
-                TENSOR_SIZE,
-                typename SetStaticIndexArray<INDEX_TYPE,IDX,VALUE,BeginType>::Seq,
-                TSIZE
-            >;
-        };
-
-        template< typename TILE, typename VALUE, size_t IDX>
-        struct SetStaticTensorTileSize;
-
-        template< typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE, INDEX_TYPE VALUE, size_t IDX > 
-        struct SetStaticTensorTileSize<
-              StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE >,
-              camp::integral_constant<INDEX_TYPE,VALUE>,
-              IDX
-        > {
-            using SizeType = StaticIndexArray<TSIZE>;
-            using Type = StaticTensorTile<
-                INDEX_TYPE,
-                TENSOR_SIZE,
-                TBEGIN,
-                typename SetStaticIndexArray<INDEX_TYPE,IDX,VALUE,SizeType>::Seq
-            >;
-        };
+template <typename INT_SEQ>
+struct StaticIndexArray;
 
+template <typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, typename ARRAY>
+struct PrependStaticIndexArray;
 
+template <typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY>
+struct AddStaticIndexArray;
 
+template <typename INDEX_TYPE, size_t IDX, INDEX_TYPE VALUE, typename ARRAY>
+struct SetStaticIndexArray;  // ARRAY with the element at IDX replaced by VALUE
 
-    template<typename POINTER_TYPE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, camp::idx_t NUM_DIMS, camp::idx_t STRIDE_ONE_DIM = -1>
-    struct TensorRef
-    {
-        static constexpr camp::idx_t    s_stride_one_dim = STRIDE_ONE_DIM;
-        static constexpr camp::idx_t    s_num_dims       = NUM_DIMS;
-        static constexpr TensorTileSize s_tensor_size    = TENSOR_SIZE;
 
-        using self_type = TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, NUM_DIMS, STRIDE_ONE_DIM>;
-        using tile_type = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
-        using pointer_type = POINTER_TYPE;
-        using index_type = INDEX_TYPE;
-        
+template <typename INDEX_TYPE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
+struct StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>> {
 
-        pointer_type m_pointer;
-        index_type m_stride[NUM_DIMS];
-        tile_type m_tile;
+  using seq_type = camp::int_seq<INDEX_TYPE, HEAD, TAIL...>;
+  using Self = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using Tail = StaticIndexArray<camp::int_seq<INDEX_TYPE, TAIL...>>;
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("TensorRef: dims=%d, m_pointer=%p, m_stride=[", (int)NUM_DIMS, m_pointer);
+  Tail tail;
 
-          for(camp::idx_t i = 0;i < NUM_DIMS;++ i){
-            printf("%ld ", (long)m_stride[i]);
-          }
-
-          printf("], stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
-
-          m_tile.print();
-        }
+  RAJA_INLINE
+  StaticIndexArray() = default;  // no template-id: ill-formed since C++20
 
-    };
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr INDEX_TYPE value_at(size_t index)
+  {
+    if (index == 0) {
+      return HEAD;
+    } else {
+      return Tail::value_at(index - 1);
+    }
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr INDEX_TYPE operator[](size_t index) const
+  {
+    if (index == 0) {
+      return HEAD;
+    } else {
+      return tail[index - 1];
+    }
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print_values() const
+  {
+    printf("%ld ", (long)HEAD);
+    tail.print_values();
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("[");
+    print_values();
+    printf("]");
+  }
+};
+
+template <typename INDEX_TYPE>
+struct StaticIndexArray<camp::int_seq<INDEX_TYPE>> {
+  // Empty-sequence specialization: the Tail of a one-element StaticIndexArray.
+  using seq_type = camp::int_seq<INDEX_TYPE>;
+
+  RAJA_INLINE
+  StaticIndexArray() = default;
+
+  // There is no element to return for any index, so lookups yield 0.
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr INDEX_TYPE value_at(size_t) { return 0; }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr INDEX_TYPE operator[](size_t) const { return 0; }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print_values() const {}
+  // Emits "[]"; this must be printf, since print() accepts no arguments.
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const { printf("[]"); }
+};
+
+template <typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, INDEX_TYPE... ORIG_INTS>
+struct PrependStaticIndexArray<  // puts NEW_HEAD in front of ORIG_INTS...
+    INDEX_TYPE,
+    NEW_HEAD,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, ORIG_INTS...>>> {
+  using Type =
+      StaticIndexArray<camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>>;
+  using Seq = camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>;  // bare seq
+};
+
+
+template <typename INDEX_TYPE,
+          size_t IDX,
+          INDEX_TYPE DELTA,
+          INDEX_TYPE HEAD,
+          INDEX_TYPE... TAIL>
+struct AddStaticIndexArray<  // general case: keep HEAD, recurse into the tail
+    INDEX_TYPE,
+    IDX,
+    DELTA,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>> {
+  using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using AddTail = typename AddStaticIndexArray<INDEX_TYPE,
+                                               IDX - 1,
+                                               DELTA,
+                                               typename Orig::Tail>::Type;
+  using Type =
+      typename PrependStaticIndexArray<INDEX_TYPE, HEAD, AddTail>::Type;
+  using Seq = typename PrependStaticIndexArray<INDEX_TYPE, HEAD, AddTail>::Seq;
+};
+
+template <typename INDEX_TYPE,
+          INDEX_TYPE DELTA,
+          INDEX_TYPE HEAD,
+          INDEX_TYPE... TAIL>
+struct AddStaticIndexArray<  // IDX == 0: the head becomes HEAD + DELTA
+    INDEX_TYPE,
+    0,
+    DELTA,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>> {
+  // The tail is reused unchanged; only the first element is modified.
+  using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using Type = typename PrependStaticIndexArray<INDEX_TYPE,
+                                                HEAD + DELTA,
+                                                typename Orig::Tail>::Type;
+  using Seq = typename PrependStaticIndexArray<INDEX_TYPE,
+                                               HEAD + DELTA,
+                                               typename Orig::Tail>::Seq;
+};
+
+
+template <typename INDEX_TYPE,
+          size_t IDX,
+          INDEX_TYPE VALUE,
+          INDEX_TYPE HEAD,
+          INDEX_TYPE... TAIL>
+struct SetStaticIndexArray<  // general case: keep HEAD, recurse into the tail
+    INDEX_TYPE,
+    IDX,
+    VALUE,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>> {
+  using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using SetTail = typename SetStaticIndexArray<INDEX_TYPE,
+                                               IDX - 1,
+                                               VALUE,
+                                               typename Orig::Tail>::Type;
+  using Type =
+      typename PrependStaticIndexArray<INDEX_TYPE, HEAD, SetTail>::Type;
+  using Seq = typename PrependStaticIndexArray<INDEX_TYPE, HEAD, SetTail>::Seq;
+};
+
+template <typename INDEX_TYPE,
+          INDEX_TYPE VALUE,
+          INDEX_TYPE HEAD,
+          INDEX_TYPE... TAIL>
+struct SetStaticIndexArray<  // IDX == 0: HEAD is dropped, VALUE takes its place
+    INDEX_TYPE,
+    0,
+    VALUE,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>> {
+  using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using Type = typename PrependStaticIndexArray<INDEX_TYPE,
+                                                VALUE,
+                                                typename Orig::Tail>::Type;
+  using Seq = typename PrependStaticIndexArray<INDEX_TYPE,
+                                               VALUE,
+                                               typename Orig::Tail>::Seq;
+};
+
+
+enum TensorTileSize {
+  TENSOR_PARTIAL,  // the tile is a partial TensorRegister
+  TENSOR_FULL,     // the tile is a full TensorRegister
+  TENSOR_MULTIPLE  // the tile is multiple TensorRegisters
+};
+
+template <typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, camp::idx_t NUM_DIMS>
+struct TensorTile {
+  using self_type = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
+  using nonstatic_self_type = self_type;
+  using index_type = INDEX_TYPE;
+  index_type m_begin[NUM_DIMS];
+  index_type m_size[NUM_DIMS];
+
+  static constexpr camp::idx_t s_num_dims = NUM_DIMS;
+  static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
+
+
+  template <typename I, TensorTileSize S>
+  void copy(TensorTile<I, S, NUM_DIMS> const &c)  // copies c's begins and sizes
+  {
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i) {
+      m_begin[i] = c.m_begin[i];
+      m_size[i] = c.m_size[i];
+    }
+  }
+
+  /*!
+   * Subtract begin offsets of two tiles.
+   *
+   * The resulting tile has the sizes of the left operand, but has
+   * m_begin[i] = left.m_begin[i] - right.m_begin[i]
+   *
+   */
+  template <typename INDEX_TYPE2, TensorTileSize TENSOR_SIZE2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type
+  operator-(TensorTile<INDEX_TYPE2, TENSOR_SIZE2, NUM_DIMS> const &sub) const
+  {
+    self_type result(*this);
+    for (camp::idx_t i = 0; i < s_num_dims; ++i) {
+      result.m_begin[i] -= sub.m_begin[i];
+    }
+    return result;
+  }
 
 
-    template<typename POINTER_TYPE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename STRIDE_TYPE, typename BEGIN_TYPE, typename SIZE_TYPE, camp::idx_t STRIDE_ONE_DIM = -1>
-    struct StaticTensorRef;
-
-    template<typename POINTER_TYPE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, INDEX_TYPE... StrideInts, INDEX_TYPE... BeginInts, INDEX_TYPE... SizeInts, camp::idx_t STRIDE_ONE_DIM>
-    struct StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,camp::int_seq<INDEX_TYPE,StrideInts...>,camp::int_seq<INDEX_TYPE,BeginInts...>,camp::int_seq<INDEX_TYPE,SizeInts...>,STRIDE_ONE_DIM>
-    {
-
-        static constexpr camp::idx_t    s_num_dims         = sizeof...(BeginInts);
-        static constexpr camp::idx_t    s_stride_one_dim   = STRIDE_ONE_DIM;
-        static constexpr TensorTileSize s_ref_tensor_size  = TENSOR_SIZE;
-        using pointer_type = POINTER_TYPE;
-        using index_type = INDEX_TYPE;
-        
-        using stride_seq = camp::int_seq<INDEX_TYPE, StrideInts...>;
-        using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
-        using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts... >;
-
-        using stride_type  = StaticIndexArray<stride_seq>;
-
-	static_assert(
-          (sizeof...(BeginInts) == sizeof...(SizeInts)) && (sizeof...(SizeInts) == sizeof...(StrideInts)),
-          "Mismatch between number of elements in Begin and Size series of StaticTensorRef"
-        );
-        
-
-        using self_type = StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,stride_seq,begin_seq,size_seq>;
-        using tile_type = StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq, size_seq>;
-
-
-        pointer_type m_pointer;
-        stride_type m_stride;
-        tile_type m_tile;
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("StaticTensorRef: dims=%d, m_pointer=%p, m_stride=", (int)s_num_dims, m_pointer);
-
-          m_stride.print();
-
-          printf(", stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
-
-          m_tile.print();
-        }
-
-    };
-
-
-
-
-    template<typename REF_TYPE, typename TILE_TYPE, typename DIM_SEQ>
-    struct MergeRefTile;
-
-    template<typename REF_TYPE, typename TILE_TYPE, camp::idx_t ... DIM_SEQ>
-    struct MergeRefTile <REF_TYPE, TILE_TYPE, camp::idx_seq<DIM_SEQ...>> {
-
-        static_assert( REF_TYPE::s_num_dims == TILE_TYPE::s_num_dims , "Merging a ref with a tile requires an equivalent number of dimensions.");
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("TensorTile: dims=%d, m_begin=[", (int)NUM_DIMS);
 
-        static constexpr camp::idx_t    s_num_dims         = REF_TYPE::s_num_dims;
-        static constexpr camp::idx_t    s_stride_one_dim   = REF_TYPE::s_stride_one_dim;
-        static constexpr TensorTileSize s_ref_tensor_size  = TILE_TYPE::s_tensor_size;
-        using pointer_type    = typename REF_TYPE::pointer_type;
-        using ref_index_type  = typename REF_TYPE::index_type;
-        
-        static constexpr TensorTileSize s_tile_tensor_size = TILE_TYPE::s_tensor_size;
-        using tile_index_type = typename TILE_TYPE::index_type;
-
-        using merge_type = TensorRef<pointer_type, tile_index_type, s_tile_tensor_size, s_num_dims, s_stride_one_dim>;
-        using shift_type = merge_type;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        merge_type merge(REF_TYPE const &ref, TILE_TYPE const &tile){
-          return merge_type{
-            ref.m_pointer,
-            {tile_index_type(ref.m_stride[DIM_SEQ])...},
-            tile
-          };
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        shift_type shift_origin(REF_TYPE const &ref, TILE_TYPE const &tile_origin){
-          return shift_type{
-            ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ]*ref.m_stride[DIM_SEQ]) ...),
-            {tile_index_type(ref.m_stride[DIM_SEQ])...},
-            ref.m_tile
-          };
-        }
-
-    };
-
-
-
-
-
-
-
-    template<
-       typename POINTER_TYPE, typename INDEX_TYPE1, TensorTileSize RTENSOR_SIZE,
-       typename STRIDE, INDEX_TYPE1... BEGIN1, INDEX_TYPE1... SIZE1, camp::idx_t STRIDE_ONE_DIM,
-       typename INDEX_TYPE2, TensorTileSize TENSOR_SIZE, typename BEGIN2, typename SIZE2,
-       camp::idx_t ... DIM_SEQ
-    >
-    struct MergeRefTile<
-       StaticTensorRef<
-              POINTER_TYPE, INDEX_TYPE1, RTENSOR_SIZE,
-              STRIDE,
-              camp::int_seq<INDEX_TYPE1,BEGIN1...>,
-              camp::int_seq<INDEX_TYPE1,SIZE1...>,
-              STRIDE_ONE_DIM
-       >,
-       StaticTensorTile<
-              INDEX_TYPE2,
-              TENSOR_SIZE,
-              BEGIN2,
-              SIZE2
-       >,
-       camp::idx_seq<DIM_SEQ...>
-    > {
-
-        using ref_tile_type = StaticTensorTile<
-                  INDEX_TYPE1,
-                  RTENSOR_SIZE,
-                  camp::int_seq<INDEX_TYPE1,BEGIN1...>,
-                  camp::int_seq<INDEX_TYPE1, SIZE1...>
-              >;
-
-        using ref_type = StaticTensorRef<
-                  POINTER_TYPE,
-                  INDEX_TYPE1,
-                  RTENSOR_SIZE,
-                  STRIDE,
-                  camp::int_seq<INDEX_TYPE1,BEGIN1...>,
-                  camp::int_seq<INDEX_TYPE1, SIZE1...>,
-                  STRIDE_ONE_DIM
-              >;
-
-        using tile_type = StaticTensorTile<
-                  INDEX_TYPE2,
-                  TENSOR_SIZE,
-                  BEGIN2,
-                  SIZE2
-              >;
-
-        using ref_stride_type = typename ref_type ::stride_type;
-
-        using new_stride_seq  = camp::int_seq<INDEX_TYPE2,INDEX_TYPE2(ref_stride_type::value_at(DIM_SEQ))...>; 
-        
-        using shift_begin_seq = camp::int_seq<INDEX_TYPE2,INDEX_TYPE2(BEGIN1)...>; 
-        using shift_size_seq  = camp::int_seq<INDEX_TYPE2,INDEX_TYPE2(SIZE1)...>; 
-       
-        using shift_tile_type = StaticTensorTile<INDEX_TYPE2,TENSOR_SIZE,shift_begin_seq,shift_size_seq>;
- 
-        using new_stride_type = StaticIndexArray<new_stride_seq>; 
-
-        using merge_type = StaticTensorRef<
-                  POINTER_TYPE,
-                  INDEX_TYPE2,
-                  TENSOR_SIZE,
-                  new_stride_seq,
-                  BEGIN2,
-                  SIZE2,
-                  STRIDE_ONE_DIM
-              >;
-
-        using shift_type = StaticTensorRef<
-                  POINTER_TYPE,
-                  INDEX_TYPE2,
-                  TENSOR_SIZE,
-                  new_stride_seq,
-                  shift_begin_seq,
-                  shift_size_seq,
-                  STRIDE_ONE_DIM
-              >;
-
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        merge_type merge(ref_type const &ref, tile_type const &tile){
-          return merge_type {
-            ref.m_pointer,
-            new_stride_type(),
-            tile
-          };
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        shift_type shift_origin(ref_type const &ref, tile_type const &tile_origin){
-          return shift_type {
-            ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ]*ref.m_stride[DIM_SEQ]) ...),
-            new_stride_type(),
-            shift_tile_type()
-          };
-        }
-
-
-
-    };
-
-
-
-
-    template<typename REF_TYPE, typename TILE_TYPE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto merge_ref_tile(REF_TYPE const &ref, TILE_TYPE const &tile) ->
-      typename MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge_type
-    {
-      return MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge(ref, tile);
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i) {
+      printf("%ld ", (long)m_begin[i]);
     }
 
+    printf("], m_size=[");
 
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i) {
+      printf("%ld ", (long)m_size[i]);
+    }
 
-    /*!
-     * Modifies a ref's pointer so that the supplied tile_origin will resolve
-     * to the original pointer.
-     */
-    template<typename REF_TYPE, typename TILE_TYPE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto shift_tile_origin(REF_TYPE const &ref, TILE_TYPE const &tile_origin) ->
-      typename MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_type
-    {
-      return MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_origin(ref, tile_origin);
+    printf("]\n");
+  }
+};
+
+
+template <typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE>
+struct StaticTensorTile;
+
+template <typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          INDEX_TYPE... BeginInts,
+          INDEX_TYPE... SizeInts>
+struct StaticTensorTile<INDEX_TYPE,
+                        TENSOR_SIZE,
+                        camp::int_seq<INDEX_TYPE, BeginInts...>,
+                        camp::int_seq<INDEX_TYPE, SizeInts...>> {
+  // Tile whose begin offsets and sizes are template arguments (BeginInts,
+  // SizeInts) rather than values stored in arrays as in TensorTile.
+  using begin_seq = camp::int_seq<INDEX_TYPE, BeginInts...>;
+  using size_seq = camp::int_seq<INDEX_TYPE, SizeInts...>;
+  using begin_type = StaticIndexArray<begin_seq>;
+  using size_type = StaticIndexArray<size_seq>;
+  using self_type =
+      StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq, size_seq>;
+  using index_type = INDEX_TYPE;
+  // TensorTile type with the same index type, size class and dimension count.
+  using nonstatic_self_type =
+      TensorTile<INDEX_TYPE, TENSOR_SIZE, sizeof...(BeginInts)>;
+  // The same begin/size sequences re-tagged as TENSOR_PARTIAL / TENSOR_FULL.
+  using Partial =
+      StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, begin_seq, size_seq>;
+  using Full = StaticTensorTile<INDEX_TYPE, TENSOR_FULL, begin_seq, size_seq>;
+
+  begin_type m_begin;
+  size_type m_size;
+
+  static_assert(sizeof...(BeginInts) == sizeof...(SizeInts),
+                "Mismatch between number of elements in Begin and Size series "
+                "of StaticTensorTile");
+
+  static constexpr camp::idx_t s_num_dims = sizeof...(BeginInts);
+  static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
+  // Converts to a TensorTile initialised with {BeginInts...} and {SizeInts...}.
+  constexpr operator nonstatic_self_type() const
+  {
+    return nonstatic_self_type{{BeginInts...}, {SizeInts...}};
+  }
+  // Named form of the conversion operator above.
+  constexpr nonstatic_self_type nonstatic() const { return *this; }
+  // Intentionally empty: the argument has identical begin/size sequences.
+  template <TensorTileSize S>
+  constexpr void copy(StaticTensorTile<INDEX_TYPE, S, begin_seq, size_seq> const
+                          RAJA_UNUSED_ARG(&c)) const
+  {
+  }
+
+  // Prints the dimension count, then m_begin and m_size via their print().
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("StaticTensorTile: dims=%d, m_begin=", (int)s_num_dims);
+
+    m_begin.print();
+
+    printf(", m_size=");
+
+    m_size.print();
+
+    printf("\n");
+  }
+};
+
+template <typename TILE, typename VALUE, size_t IDX>
+struct SetStaticTensorTileBegin;
+// Type is the given StaticTensorTile with begin element IDX replaced by VALUE.
+template <typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE,
+          INDEX_TYPE VALUE,
+          size_t IDX>
+struct SetStaticTensorTileBegin<
+    StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE>,
+    camp::integral_constant<INDEX_TYPE, VALUE>,
+    IDX> {
+  using BeginType = StaticIndexArray<TBEGIN>;
+  using Type = StaticTensorTile<
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      typename SetStaticIndexArray<INDEX_TYPE, IDX, VALUE, BeginType>::Seq,
+      TSIZE>;
+};
+
+template <typename TILE, typename VALUE, size_t IDX>
+struct SetStaticTensorTileSize;
+// Type is the given StaticTensorTile with size element IDX replaced by VALUE.
+template <typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE,
+          INDEX_TYPE VALUE,
+          size_t IDX>
+struct SetStaticTensorTileSize<
+    StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE>,
+    camp::integral_constant<INDEX_TYPE, VALUE>,
+    IDX> {
+  using SizeType = StaticIndexArray<TSIZE>;
+  using Type = StaticTensorTile<
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      TBEGIN,
+      typename SetStaticIndexArray<INDEX_TYPE, IDX, VALUE, SizeType>::Seq>;
+};
+
+
+template <typename POINTER_TYPE,
+          typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          camp::idx_t NUM_DIMS,
+          camp::idx_t STRIDE_ONE_DIM = -1>
+struct TensorRef {
+  static constexpr camp::idx_t s_stride_one_dim = STRIDE_ONE_DIM;
+  static constexpr camp::idx_t s_num_dims = NUM_DIMS;
+  static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
+
+  using self_type = TensorRef<POINTER_TYPE,
+                              INDEX_TYPE,
+                              TENSOR_SIZE,
+                              NUM_DIMS,
+                              STRIDE_ONE_DIM>;
+  using tile_type = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
+  using pointer_type = POINTER_TYPE;
+  using index_type = INDEX_TYPE;
+
+  // Data members: a pointer, one stride per dimension, and a tile.
+  pointer_type m_pointer;
+  index_type m_stride[NUM_DIMS];
+  tile_type m_tile;
+  // Prints the pointer, each stride, STRIDE_ONE_DIM and then the tile.
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("TensorRef: dims=%d, m_pointer=%p, m_stride=[",
+           (int)NUM_DIMS,
+           m_pointer);
+
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i) {
+      printf("%ld ", (long)m_stride[i]);
     }
 
+    printf("], stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
+
+    m_tile.print();
+  }
+};
+
+
+template <typename POINTER_TYPE,
+          typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename STRIDE_TYPE,
+          typename BEGIN_TYPE,
+          typename SIZE_TYPE,
+          camp::idx_t STRIDE_ONE_DIM = -1>
+struct StaticTensorRef;
+
+template <typename POINTER_TYPE,
+          typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          INDEX_TYPE... StrideInts,
+          INDEX_TYPE... BeginInts,
+          INDEX_TYPE... SizeInts,
+          camp::idx_t STRIDE_ONE_DIM>
+struct StaticTensorRef<POINTER_TYPE,
+                       INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, StrideInts...>,
+                       camp::int_seq<INDEX_TYPE, BeginInts...>,
+                       camp::int_seq<INDEX_TYPE, SizeInts...>,
+                       STRIDE_ONE_DIM> {
+  // Reference whose strides, begins and sizes are template arguments.
+  static constexpr camp::idx_t s_num_dims = sizeof...(BeginInts);
+  static constexpr camp::idx_t s_stride_one_dim = STRIDE_ONE_DIM;
+  static constexpr TensorTileSize s_ref_tensor_size = TENSOR_SIZE;
+  using pointer_type = POINTER_TYPE;
+  using index_type = INDEX_TYPE;
+
+  using stride_seq = camp::int_seq<INDEX_TYPE, StrideInts...>;
+  using begin_seq = camp::int_seq<INDEX_TYPE, BeginInts...>;
+  using size_seq = camp::int_seq<INDEX_TYPE, SizeInts...>;
+
+  using stride_type = StaticIndexArray<stride_seq>;
+
+  static_assert((sizeof...(BeginInts) == sizeof...(SizeInts)) &&
+                    (sizeof...(SizeInts) == sizeof...(StrideInts)),
+                "Mismatch between number of elements in Stride, Begin and "
+                "Size series of StaticTensorRef");
+
+  // STRIDE_ONE_DIM is passed explicitly so self_type names this exact type.
+  using self_type = StaticTensorRef<POINTER_TYPE,
+                                    INDEX_TYPE,
+                                    TENSOR_SIZE,
+                                    stride_seq,
+                                    begin_seq,
+                                    size_seq, STRIDE_ONE_DIM>;
+  using tile_type =
+      StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq, size_seq>;
+
+
+  pointer_type m_pointer;
+  stride_type m_stride;
+  tile_type m_tile;
+  // Prints the pointer, the strides, STRIDE_ONE_DIM and then the tile.
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("StaticTensorRef: dims=%d, m_pointer=%p, m_stride=",
+           (int)s_num_dims,
+           m_pointer);
+
+    m_stride.print();
+
+    printf(", stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
+
+    m_tile.print();
+  }
+};
+
+
+template <typename REF_TYPE, typename TILE_TYPE, typename DIM_SEQ>
+struct MergeRefTile;
+
+template <typename REF_TYPE, typename TILE_TYPE, camp::idx_t... DIM_SEQ>
+struct MergeRefTile<REF_TYPE, TILE_TYPE, camp::idx_seq<DIM_SEQ...>> {
+  // Generic case: both results are TensorRef values using the tile index type.
+  static_assert(REF_TYPE::s_num_dims == TILE_TYPE::s_num_dims,
+                "Merging a ref with a tile requires an equivalent number of "
+                "dimensions.");
+  // NOTE(review): s_ref_tensor_size reads TILE_TYPE, not REF_TYPE; confirm.
+  static constexpr camp::idx_t s_num_dims = REF_TYPE::s_num_dims;
+  static constexpr camp::idx_t s_stride_one_dim = REF_TYPE::s_stride_one_dim;
+  static constexpr TensorTileSize s_ref_tensor_size = TILE_TYPE::s_tensor_size;
+  using pointer_type = typename REF_TYPE::pointer_type;
+  using ref_index_type = typename REF_TYPE::index_type;
+
+  static constexpr TensorTileSize s_tile_tensor_size = TILE_TYPE::s_tensor_size;
+  using tile_index_type = typename TILE_TYPE::index_type;
+
+  using merge_type = TensorRef<pointer_type,
+                               tile_index_type,
+                               s_tile_tensor_size,
+                               s_num_dims,
+                               s_stride_one_dim>;
+  using shift_type = merge_type;
+  // Returns ref's pointer and strides (cast to tile_index_type) with `tile`.
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr merge_type merge(REF_TYPE const &ref, TILE_TYPE const &tile)
+  {
+    return merge_type{ref.m_pointer,
+                      {tile_index_type(ref.m_stride[DIM_SEQ])...},
+                      tile};
+  }
+  // Moves the pointer back by sum(m_begin[d] * m_stride[d]); keeps ref.m_tile.
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr shift_type shift_origin(REF_TYPE const &ref,
+                                           TILE_TYPE const &tile_origin)
+  {
+    return shift_type{ref.m_pointer -
+                          RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ] *
+                                                  ref.m_stride[DIM_SEQ])...),
+                      {tile_index_type(ref.m_stride[DIM_SEQ])...},
+                      ref.m_tile};
+  }
+};
+
+
+template <typename POINTER_TYPE,
+          typename INDEX_TYPE1,
+          TensorTileSize RTENSOR_SIZE,
+          typename STRIDE,
+          INDEX_TYPE1... BEGIN1,
+          INDEX_TYPE1... SIZE1,
+          camp::idx_t STRIDE_ONE_DIM,
+          typename INDEX_TYPE2,
+          TensorTileSize TENSOR_SIZE,
+          typename BEGIN2,
+          typename SIZE2,
+          camp::idx_t... DIM_SEQ>
+struct MergeRefTile<StaticTensorRef<POINTER_TYPE,
+                                    INDEX_TYPE1,
+                                    RTENSOR_SIZE,
+                                    STRIDE,
+                                    camp::int_seq<INDEX_TYPE1, BEGIN1...>,
+                                    camp::int_seq<INDEX_TYPE1, SIZE1...>,
+                                    STRIDE_ONE_DIM>,
+                    StaticTensorTile<INDEX_TYPE2, TENSOR_SIZE, BEGIN2, SIZE2>,
+                    camp::idx_seq<DIM_SEQ...>> {
+  // StaticTensorRef + StaticTensorTile case: results are StaticTensorRef types.
+  using ref_tile_type = StaticTensorTile<INDEX_TYPE1,
+                                         RTENSOR_SIZE,
+                                         camp::int_seq<INDEX_TYPE1, BEGIN1...>,
+                                         camp::int_seq<INDEX_TYPE1, SIZE1...>>;
+
+  using ref_type = StaticTensorRef<POINTER_TYPE,
+                                   INDEX_TYPE1,
+                                   RTENSOR_SIZE,
+                                   STRIDE,
+                                   camp::int_seq<INDEX_TYPE1, BEGIN1...>,
+                                   camp::int_seq<INDEX_TYPE1, SIZE1...>,
+                                   STRIDE_ONE_DIM>;
+
+  using tile_type = StaticTensorTile<INDEX_TYPE2, TENSOR_SIZE, BEGIN2, SIZE2>;
+
+  using ref_stride_type = typename ref_type ::stride_type;
+  // The ref's strides, read per dimension and cast to the tile's index type.
+  using new_stride_seq =
+      camp::int_seq<INDEX_TYPE2,
+                    INDEX_TYPE2(ref_stride_type::value_at(DIM_SEQ))...>;
+  // The ref's own begin/size values cast to the tile's index type.
+  using shift_begin_seq = camp::int_seq<INDEX_TYPE2, INDEX_TYPE2(BEGIN1)...>;
+  using shift_size_seq = camp::int_seq<INDEX_TYPE2, INDEX_TYPE2(SIZE1)...>;
+
+  using shift_tile_type = StaticTensorTile<INDEX_TYPE2,
+                                           TENSOR_SIZE,
+                                           shift_begin_seq,
+                                           shift_size_seq>;
+
+  using new_stride_type = StaticIndexArray<new_stride_seq>;
+
+  using merge_type = StaticTensorRef<POINTER_TYPE,
+                                     INDEX_TYPE2,
+                                     TENSOR_SIZE,
+                                     new_stride_seq,
+                                     BEGIN2,
+                                     SIZE2,
+                                     STRIDE_ONE_DIM>;
+
+  using shift_type = StaticTensorRef<POINTER_TYPE,
+                                     INDEX_TYPE2,
+                                     TENSOR_SIZE,
+                                     new_stride_seq,
+                                     shift_begin_seq,
+                                     shift_size_seq,
+                                     STRIDE_ONE_DIM>;
+
+  // Returns ref's pointer with the converted stride type and the given tile.
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr merge_type merge(ref_type const &ref, tile_type const &tile)
+  {
+    return merge_type{ref.m_pointer, new_stride_type(), tile};
+  }
+  // Moves the pointer back by sum(m_begin[d] * m_stride[d]) of tile_origin.
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr shift_type shift_origin(ref_type const &ref,
+                                           tile_type const &tile_origin)
+  {
+    return shift_type{ref.m_pointer -
+                          RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ] *
+                                                  ref.m_stride[DIM_SEQ])...),
+                      new_stride_type(),
+                      shift_tile_type()};
+  }
+};
+
+
+template <typename REF_TYPE, typename TILE_TYPE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto merge_ref_tile(
+    REF_TYPE const &ref,
+    TILE_TYPE const &tile) ->
+    typename MergeRefTile<
+        REF_TYPE,
+        TILE_TYPE,
+        camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge_type
+{
+  return MergeRefTile<REF_TYPE,
+                      TILE_TYPE,
+                      camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge(ref,
+                                                                          tile);
+}
 
 
-    /*!
-     * Changes TensorTile size type to FULL
-     */
-    template<typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, camp::idx_t NUM_DIMS>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS> &
-    make_tensor_tile_full(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS> &tile){
-      return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS> &>(tile);
-    }
+/*!
+ * Modifies a ref's pointer so that the supplied tile_origin will resolve
+ * to the original pointer.
+ */
+template <typename REF_TYPE, typename TILE_TYPE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto shift_tile_origin(
+    REF_TYPE const &ref,
+    TILE_TYPE const &tile_origin) ->
+    typename MergeRefTile<
+        REF_TYPE,
+        TILE_TYPE,
+        camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_type
+{
+  return MergeRefTile<
+      REF_TYPE,
+      TILE_TYPE,
+      camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_origin(ref,
+                                                                 tile_origin);
+}
 
-    /*!
-     * Changes TensorTile size type to PARTIAL
-     */
-    template<typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, camp::idx_t NUM_DIMS>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS> &
-    make_tensor_tile_partial(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS> &tile){
-      return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS> &>(tile);
-    }
 
+/*!
+ * Changes TensorTile size type to FULL
+ */
+template <typename INDEX_TYPE,
+          TensorTileSize RTENSOR_SIZE,
+          camp::idx_t NUM_DIMS>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorTile<INDEX_TYPE,
+                                                  TENSOR_FULL,
+                                                  NUM_DIMS>
+    &make_tensor_tile_full(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS> &tile)
+{
+  return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS> &>(
+      tile);
+}
 
+/*!
+ * Changes TensorTile size type to PARTIAL
+ */
+template <typename INDEX_TYPE,
+          TensorTileSize RTENSOR_SIZE,
+          camp::idx_t NUM_DIMS>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorTile<INDEX_TYPE,
+                                                  TENSOR_PARTIAL,
+                                                  NUM_DIMS>
+    &make_tensor_tile_partial(
+        TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS> &tile)
+{
+  return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS> &>(
+      tile);
+}
 
-    /*!
-     * Changes StaticTensorTile size type to FULL
-     */
-    template< typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, typename TBEGIN, typename TSIZE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    StaticTensorTile<INDEX_TYPE, TENSOR_FULL, TBEGIN, TSIZE> &
-    make_tensor_tile_full(StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE> &tile){
-      return reinterpret_cast<StaticTensorTile<INDEX_TYPE, TENSOR_FULL, TBEGIN, TSIZE> &>(tile);
-    }
 
-    /*!
-     * Changes StaticTensorTile size type to PARTIAL
-     */
-    template< typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, typename TBEGIN, typename TSIZE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, TBEGIN, TSIZE> &
-    make_tensor_tile_partial(StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE> &tile){
-      return reinterpret_cast<StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, TBEGIN, TSIZE> &>(tile);
-    }
+/*!
+ * Changes StaticTensorTile size type to FULL
+ */
+template <typename INDEX_TYPE,
+          TensorTileSize RTENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr StaticTensorTile<INDEX_TYPE,
+                                                        TENSOR_FULL,
+                                                        TBEGIN,
+                                                        TSIZE>
+    &make_tensor_tile_full(
+        StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE> &tile)
+{
+  return reinterpret_cast<
+      StaticTensorTile<INDEX_TYPE, TENSOR_FULL, TBEGIN, TSIZE> &>(tile);
+}
 
+/*!
+ * Changes StaticTensorTile size type to PARTIAL
+ */
+template <typename INDEX_TYPE,
+          TensorTileSize RTENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr StaticTensorTile<INDEX_TYPE,
+                                                        TENSOR_PARTIAL,
+                                                        TBEGIN,
+                                                        TSIZE>
+    &make_tensor_tile_partial(
+        StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE> &tile)
+{
+  return reinterpret_cast<
+      StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, TBEGIN, TSIZE> &>(tile);
+}
 
 
-  } // namespace expt
-} // namespace internal
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp b/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp
index d2bce598ff..70fcdf88fb 100644
--- a/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp
@@ -19,12 +19,10 @@
 #define RAJA_pattern_tensor_TensorRegisterBase_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
-#include "camp/camp.hpp"
 #include "RAJA/pattern/tensor/TensorLayout.hpp"
 #include "RAJA/pattern/tensor/internal/TensorRef.hpp"
+#include "RAJA/util/macros.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -34,815 +32,781 @@ namespace expt
 {
 
 
+namespace ET
+{
+class TensorExpressionConcreteBase;
+}  // namespace ET
+
+
+template <typename TENSOR, camp::idx_t DIM>
+struct TensorDimSize {
+  static constexpr camp::idx_t value = TENSOR::s_dim_size(DIM);
+};
+
+/*
+ * Tensor product helper class.
+ *
+ * This defines the default product operation between types when using the
+ * operator*
+ *
+ */
+template <typename LHS, typename RHS>
+struct TensorDefaultOperation {
+
+  using multiply_type = decltype(LHS().multiply(RHS()));
+
+  // default multiplication operator
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static multiply_type multiply(LHS const &lhs, RHS const &rhs)
+  {
+    return lhs.multiply(rhs);
+  }
+};
+
+
+template <typename REF_TYPE>
+struct TensorRegisterStoreRef {
+  using self_type = TensorRegisterStoreRef<REF_TYPE>;
+  REF_TYPE m_ref;
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator=(RHS const &rhs)
+  {
+
+    rhs.store_ref(m_ref);
+    return *this;
+  }
+};
+
+template <camp::idx_t N, camp::idx_t D>
+struct DivideRoundUp {
+  static constexpr camp::idx_t value = (N % D) > 0 ? (1 + N / D) : (N / D);
+};
+
+
+class TensorRegisterConcreteBase
+{
+};
+
+/*!
+ * TensorRegister base class that provides some default behaviors and simplifies
+ * the implementation of new register types.
+ *
+ * This uses CRTP to provide static polymorphism
+ */
+template <typename Derived>
+class TensorRegisterBase;
+
+template <typename REGISTER_POLICY,
+          typename T,
+          typename LAYOUT,
+          typename camp::idx_t... SIZES>
+class TensorRegisterBase<
+    RAJA::expt::
+        TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>>
+    : public TensorRegisterConcreteBase
+{
+public:
+  using self_type = RAJA::expt::
+      TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>;
+  using element_type = camp::decay<T>;
+
+  static constexpr camp::idx_t s_num_dims = sizeof...(SIZES);
+
+  static constexpr camp::idx_t s_num_registers =
+      DivideRoundUp<RAJA::product<camp::idx_t>(SIZES...),
+                    RegisterTraits<REGISTER_POLICY, T>::s_num_elem>::value;
+
+  using index_type = camp::idx_t;
 
+  using register_type = RAJA::expt::Register<T, REGISTER_POLICY>;
 
+  using register_policy = REGISTER_POLICY;
 
-  namespace ET
+private:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type *getThis() { return static_cast<self_type *>(this); }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr self_type const *getThis() const
   {
-    class TensorExpressionConcreteBase;
-  } // namespace ET
+    return static_cast<self_type const *>(this);
+  }
+
+protected:
+  register_type m_registers[s_num_registers];
+
+public:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr TensorRegisterBase() {}
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  TensorRegisterBase(element_type c) { broadcast(c); }
 
 
-  template<typename TENSOR, camp::idx_t DIM>
-  struct TensorDimSize{
-      static constexpr camp::idx_t value = TENSOR::s_dim_size(DIM);
-  };
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorRegisterBase(self_type const &c) { copy(c); }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  ~TensorRegisterBase() {}
+
 
   /*
-   * Tensor product helper class.
-   *
-   * This defines the default product operation between types when using the
-   * operator*
-   *
+   * Overload for:    assignment of ET to a TensorRegister
    */
-  template<typename LHS, typename RHS>
-  struct TensorDefaultOperation{
+  template <typename RHS,
+            typename std::enable_if<
+                std::is_base_of<ET::TensorExpressionConcreteBase, RHS>::value,
+                bool>::type = true>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorRegisterBase(RHS const &rhs)
+  {
+    // evaluate a single tile of the ET, storing in this TensorRegister
+    *this = rhs.eval(self_type::s_get_default_tile());
+  }
 
-      using multiply_type = decltype(LHS().multiply(RHS()));
 
-      // default multiplication operator
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      multiply_type multiply(LHS const &lhs, RHS const &rhs)
-      {
-        return lhs.multiply(rhs);
-      }
+  template <typename... REGS>
+  explicit RAJA_HOST_DEVICE RAJA_INLINE TensorRegisterBase(register_type reg0,
+                                                           REGS const &...regs)
+      : m_registers{reg0, regs...}
+  {
+    static_assert(1 + sizeof...(REGS) == s_num_registers,
+                  "Incompatible number of registers");
+  }
 
-  };
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr bool is_root() { return register_type::is_root(); }
 
 
-  template<typename REF_TYPE>
-  struct TensorRegisterStoreRef{
-      using self_type = TensorRegisterStoreRef<REF_TYPE>;
-      REF_TYPE m_ref;
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE static constexpr TensorRegisterStoreRef<REF_TYPE>
+  create_et_store_ref(REF_TYPE const &ref)
+  {
+    return TensorRegisterStoreRef<REF_TYPE>{ref};
+  }
 
-      RAJA_SUPPRESS_HD_WARN
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator=(RHS const &rhs)
-      {
+  RAJA_SUPPRESS_HD_WARN
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE static self_type s_load_ref(REF_TYPE const &ref)
+  {
 
-        rhs.store_ref(m_ref);
-        return *this;
-      }
-  };
+    self_type value;
 
-  template<camp::idx_t N, camp::idx_t D>
-  struct DivideRoundUp {
-      static constexpr camp::idx_t value =
-          (N % D) > 0 ? (1 + N/D) : (N/D);
-  };
+    value.load_ref(ref);
+    return value;
+  }
 
+  /*!
+   * Gets the size of the tensor
+   * Since this is a vector, just the length of the vector in dim 0
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr int s_dim_elem(int dim)
+  {
+    return (dim == 0) ? self_type::s_num_elem : 0;
+  }
 
-  class TensorRegisterConcreteBase {};
 
   /*!
-   * TensorRegister base class that provides some default behaviors and simplifies
-   * the implementation of new register types.
-   *
-   * This uses CRTP to provide static polymorphism
+   * Gets the default tile of this tensor
+   * That tile always starts at 0, and extends to the full tile sizes
+   */
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr StaticTensorTile<int,
+                                    TENSOR_FULL,
+                                    camp::int_seq<int, int(SIZES * 0)...>,
+                                    camp::int_seq<int, int(SIZES)...>>
+  s_get_default_tile()
+  {
+    return StaticTensorTile<int,
+                            TENSOR_FULL,
+                            camp::int_seq<int, int(SIZES * 0)...>,
+                            camp::int_seq<int, int(SIZES)...>>();
+  }
+
+  /*!
+   * @brief convenience routine to allow Vector classes to use
+   * camp::sink() across a variety of register types, and use things like
+   * ternary operators
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr bool sink() const { return false; }
+
+
+  /*!
+   * Copy contents of another tensor
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &c)
+  {
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      m_registers[i] = c.vec(i);
+    }
+    return *getThis();
+  }
+
+
+  /*!
+   * Sets all elements to zero
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &clear()
+  {
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      m_registers[i] = register_type(0);
+    }
+
+
+    return *getThis();
+  }
+
+
+  /*!
+   * Broadcast the scalar value v to each underlying register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type v)
+  {
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      m_registers[i].broadcast(v);
+    }
+    return *getThis();
+  }
+
+
+  /*!
+   * @brief Broadcast scalar value to first N register elements
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast_n(element_type const &value, camp::idx_t N)
+  {
+    for (camp::idx_t i = 0; i < N; ++i) {
+      getThis()->set(value, i);
+    }
+    return *getThis();
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.broadcast(getThis()->get(i));
+    return x;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &mat) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      result.vec(i) = m_registers[i].add(mat.vec(i));
+    }
+    return result;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &mat) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      result.vec(i) = m_registers[i].subtract(mat.vec(i));
+    }
+    return result;
+  }
+
+
+  /*!
+   * element-wise multiplication
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &x) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      result.vec(i) = m_registers[i].multiply(x.vec(i));
+    }
+    return result;
+  }
+
+  /*!
+   * element-wise fused multiply add
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply_add(self_type const &x, self_type const &add) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      result.vec(i) = m_registers[i].multiply_add(x.vec(i), add.vec(i));
+    }
+    return result;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &mat) const
+  {
+    self_type result;
+    for (camp::idx_t reg = 0; reg < s_num_registers; ++reg) {
+      result.vec(reg) = m_registers[reg].divide(mat.vec(reg));
+    }
+    return result;
+  }
+
+
+  /*!
+   * @brief Dot product of two vectors
+   * @param x Other vector to dot with this vector
+   * @return Value of (*this) dot x
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type dot(self_type const &x) const
+  {
+    element_type result(0);
+
+    for (camp::idx_t reg = 0; reg < s_num_registers; ++reg) {
+      result += m_registers[reg].multiply(x.vec(reg)).sum();
+    }
+
+    return result;
+  }
+
+
+  /*!
+   * @brief Set entire vector to a single scalar value
+   * @param value Value to set all vector elements to
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &operator=(element_type value)
+  {
+    getThis()->broadcast(value);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Set entire vector to a single scalar value
+   * @param value Value to set all vector elements to
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const &operator=(
+      RAJA::expt::TensorRegister<RAJA::expt::scalar_register,
+                                 T2,
+                                 RAJA::expt::ScalarLayout,
+                                 camp::idx_seq<>> const &value)
+  {
+    getThis()->broadcast(value.get(0));
+    return *getThis();
+  }
+
+  /*!
+   * @brief Assign one register to another
+   * @param x Vector to copy
+   * @return Value of (*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &operator=(self_type const &x)
+  {
+    getThis()->copy(x);
+    return *getThis();
+  }
+
+
+  /*!
+   * @brief Add two vector registers
+   * @param x Vector to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator+(self_type const &x) const { return getThis()->add(x); }
+
+
+  /*!
+   * @brief Add a vector to this vector
+   * @param x Vector to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator+=(self_type const &x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Add vector to a scalar
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator+(element_type const &x) const { return getThis()->add(x); }
+
+
+  /*!
+   * @brief Add a scalar to this vector
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator+=(element_type x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Negate the value of this vector
+   * @return Value of -(*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-() const { return self_type(0).subtract(*getThis()); }
+
+  /*!
+   * @brief Subtract two vector registers
+   * @param x Vector to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-(self_type const &x) const
+  {
+    return getThis()->subtract(x);
+  }
+
+  /*!
+   * @brief Subtract a vector from this vector
+   * @param x Vector to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator-=(self_type const &x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Subtract scalar from this register
+   * @param x scalar to subtract from this register
+   * @return Value of (*this)-x
    */
-  template<typename Derived>
-  class TensorRegisterBase;
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-(element_type const &x) const
+  {
+    return getThis()->subtract(x);
+  }
 
-  template<typename REGISTER_POLICY, typename T, typename LAYOUT, typename camp::idx_t ... SIZES>
-  class TensorRegisterBase<RAJA::expt::TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>> :
-    public TensorRegisterConcreteBase
+  /*!
+   * @brief Subtract a scalar from this vector
+   * @param x scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator-=(element_type const &x)
   {
-    public:
-      using self_type = RAJA::expt::TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>;
-      using element_type = camp::decay<T>;
-
-      static constexpr camp::idx_t s_num_dims = sizeof...(SIZES);
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
 
-      static constexpr camp::idx_t s_num_registers = DivideRoundUp<RAJA::product<camp::idx_t>(SIZES...), RegisterTraits<REGISTER_POLICY,T>::s_num_elem>::value;
-
-      using index_type = camp::idx_t;
-
-      using register_type = RAJA::expt::Register<T, REGISTER_POLICY>;
-
-      using register_policy = REGISTER_POLICY;
-
-    private:
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type *getThis(){
-        return static_cast<self_type *>(this);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      self_type const *getThis() const{
-        return static_cast<self_type const *>(this);
-      }
-
-    protected:
-
-      register_type m_registers[s_num_registers];
-
-    public:
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      TensorRegisterBase(){}
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegisterBase(element_type c)
-      {
-        broadcast(c);
-      }
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegisterBase(self_type const &c)
-      {
-        copy(c);
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      ~TensorRegisterBase(){}
-
-
-      /*
-       * Overload for:    assignment of ET to a TensorRegister
-       */
-      template<typename RHS,
-        typename std::enable_if<std::is_base_of<ET::TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegisterBase(RHS const &rhs)
-      {
-        // evaluate a single tile of the ET, storing in this TensorRegister
-        *this = rhs.eval(self_type::s_get_default_tile());
-      }
-
-
-      template<typename ... REGS>
-      explicit
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegisterBase(register_type reg0, REGS const &... regs) :
-        m_registers{reg0, regs...}
-      {
-        static_assert(1+sizeof...(REGS) == s_num_registers,
-            "Incompatible number of registers");
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return register_type::is_root();
-      }
-
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      TensorRegisterStoreRef<REF_TYPE>
-      create_et_store_ref(REF_TYPE const &ref) {
-        return TensorRegisterStoreRef<REF_TYPE>{ref};
-      }
-
-      RAJA_SUPPRESS_HD_WARN
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      self_type
-      s_load_ref(REF_TYPE const &ref) {
-
-        self_type value;
-
-        value.load_ref(ref);
-        return value;
-      }
-
-      /*!
-       * Gets the size of the tensor
-       * Since this is a vector, just the length of the vector in dim 0
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr int s_dim_elem(int dim){
-        return (dim==0) ? self_type::s_num_elem : 0;
-      }
-
-
-      /*!
-       * Gets the default tile of this tensor
-       * That tile always start at 0, and extends to the full tile sizes
-       */
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr StaticTensorTile<int, TENSOR_FULL, camp::int_seq<int,int(SIZES*0)...>, camp::int_seq<int,int(SIZES)...>>
-      s_get_default_tile()
-      {
-        return StaticTensorTile<int, TENSOR_FULL, camp::int_seq<int,int(SIZES*0)...>, camp::int_seq<int,int(SIZES)...>>();
-      }
-
-      /*!
-       * @brief convenience routine to allow Vector classes to use
-       * camp::sink() across a variety of register types, and use things like
-       * ternary operators
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      bool sink() const{
-        return false;
-      }
-
-
-
-
-
-
-      /*!
-       * Copy contents of another tensor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &c){
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          m_registers[i] = c.vec(i);
-        }
-        return *getThis();
-      }
-
-
-
-
-      /*!
-       * Sets all elements to zero
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &clear(){
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          m_registers[i] = register_type(0);
-        }
-
-
-        return *getThis();
-      }
-
-
-      /*!
-       * Copy contents of another matrix operator
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type v){
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          m_registers[i].broadcast(v);
-        }
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Broadcast scalar value to first N register elements
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast_n(element_type const &value, camp::idx_t N){
-        for(camp::idx_t i = 0;i < N;++ i){
-          getThis()->set(value, i);
-        }
-        return *getThis();
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.broadcast(getThis()->get(i));
-        return x;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &mat) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].add(mat.vec(i));
-        }
-        return result;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &mat) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].subtract(mat.vec(i));
-        }
-        return result;
-      }
-
-
-      /*!
-       * element-wise multiplication
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &x) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].multiply(x.vec(i));
-        }
-        return result;
-      }
-
-      /*!
-       * element-wise fused multiply add
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply_add(self_type const &x, self_type const &add) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].multiply_add(x.vec(i), add.vec(i));
-        }
-        return result;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &mat) const {
-        self_type result;
-        for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-          result.vec(reg) = m_registers[reg].divide(mat.vec(reg));
-        }
-        return result;
-      }
-
-
-
-      /*!
-       * @brief Dot product of two vectors
-       * @param x Other vector to dot with this vector
-       * @return Value of (*this) dot x
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type dot(self_type const &x) const
-      {
-        element_type result(0);
-
-        for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-          result += m_registers[reg].multiply(x.vec(reg)).sum();
-        }
-
-        return result;
-      }
-
-
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &operator=(element_type value)
-      {
-        getThis()->broadcast(value);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &operator=(RAJA::expt::TensorRegister<RAJA::expt::scalar_register, T2, RAJA::expt::ScalarLayout, camp::idx_seq<>> const &value)
-      {
-        getThis()->broadcast(value.get(0));
-        return *getThis();
-      }
-
-      /*!
-       * @brief Assign one register to antoher
-       * @param x Vector to copy
-       * @return Value of (*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &operator=(self_type const &x)
-      {
-        getThis()->copy(x);
-        return *getThis();
-      }
-
-
-
-
-
-      /*!
-       * @brief Add two vector registers
-       * @param x Vector to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(self_type const &x) const
-      {
-        return getThis()->add(x);
-      }
-
-
-      /*!
-       * @brief Add a vector to this vector
-       * @param x Vector to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(self_type const &x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Add vector to a scalar
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(element_type const &x) const
-      {
-        return getThis()->add(x);
-      }
-
-
-      /*!
-       * @brief Add a scalar to this vector
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(element_type x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Negate the value of this vector
-       * @return Value of -(*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-() const
-      {
-        return self_type(0).subtract(*getThis());
-      }
-
-      /*!
-       * @brief Subtract two vector registers
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(self_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
-
-      /*!
-       * @brief Subtract a vector from this vector
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(self_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Subtract scalar from this register
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(element_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
-
-      /*!
-       * @brief Subtract a scalar from this vector
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(element_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Multiply two vector registers, element wise
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
+  /*!
+   * @brief Multiply two vector registers, element wise
+   * @param rhs Right-hand operand to multiply with this register
+   * @return Product of (*this) and rhs
+   */
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE
       typename TensorDefaultOperation<self_type, RHS>::multiply_type
       operator*(RHS const &rhs) const
-      {
-        return TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
-      }
-
-      /*!
-       * @brief Multiply a vector with this vector
-       * @param x Vector to multiple with this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator*=(RHS const &rhs)
-      {
-        *getThis() = TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Divide two vector registers, element wise
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(self_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
-
-      /*!
-       * @brief Divide this vector by another vector
-       * @param x Vector to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(self_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Divide by a scalar, element wise
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(element_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
-
-      /*!
-       * @brief Divide this vector by another vector
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(element_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Returns element wise minimum value tensor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmin(self_type x) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].vmin(x.vec(i));
-        }
-        return result;
-      }
-
-
-      /*!
-       * @brief Returns element wise maximum value tensor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmax(self_type x) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].vmax(x.vec(i));
-        }
-        return result;
-      }
-
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      register_type &vec(int i){
-        return m_registers[i];
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      register_type const &vec(int i) const{
-        return m_registers[i];
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      register_type &get_register(int reg){
-        return m_registers[reg];
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      register_type const &get_register(int reg) const{
-        return m_registers[reg];
-      }
-
-
-
-      /*!
-       * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
-       *
-       * Derived types can override this to implement intrinsic FMS's
-       *
-       * @param b Second product operand
-       * @param c Subtraction operand
-       * @return Value of (*this)*b-c
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return getThis()->multiply_add(b, -c);
-      }
-
-      /*!
-       * Multiply this tensor by a scalar value
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type scale(element_type c) const
-      {
-        return getThis()->multiply(self_type(c));
-      }
-
-
-      /*!
-       * In-place add operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_add(self_type x){
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place sbutract operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_subtract(self_type x){
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place multiply operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_multiply(self_type x){
-        *getThis() = getThis()->multiply(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place multiply-add operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_multiply_add(self_type x, self_type y){
-        *getThis() = getThis()->multiply_add(x,y);
-        return *getThis();
-      }
-
-      /*!
-       * In-place multiply-subtract operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_multiply_subtract(self_type x, self_type y){
-        *getThis() = getThis()->multiply_subtract(x,y);
-        return *getThis();
-      }
-
-      /*!
-       * In-place divide operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_divide(self_type x){
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place scaling operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_scale(element_type x){
-        *getThis() = getThis()->scale(x);
-        return *getThis();
-      }
-
-  };
-
-} //namespace internal
-
-} // namespace expt
+  {
+    return TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
+  }
 
-}  // namespace RAJA
+  /*!
+   * @brief Multiply a vector with this vector
+   * @param rhs Right-hand operand to multiply with this register
+   * @return Reference to (*this) after the multiplication
+   */
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &operator*=(RHS const &rhs)
+  {
+    *getThis() =
+        TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Divide two vector registers, element wise
+   * @param x Vector to divide this register by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type operator/(self_type const &x) const { return getThis()->divide(x); }
+
+  /*!
+   * @brief Divide this vector by another vector
+   * @param x Vector to divide by
+   * @return Reference to (*this) after the division
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator/=(self_type const &x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
+
+
+  /*!
+   * @brief Divide by a scalar, element wise
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type operator/(element_type const &x) const
+  {
+    return getThis()->divide(x);
+  }
+
+  /*!
+   * @brief Divide this vector by a scalar
+   * @param x Scalar to divide by
+   * @return Reference to (*this) after the division
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator/=(element_type const &x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
+
+
+  /*!
+   * @brief Returns element wise minimum value tensor
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type vmin(self_type x) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      result.vec(i) = m_registers[i].vmin(x.vec(i));
+    }
+    return result;
+  }
+
+
+  /*!
+   * @brief Returns element wise maximum value tensor
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type vmax(self_type x) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      result.vec(i) = m_registers[i].vmax(x.vec(i));
+    }
+    return result;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  register_type &vec(int i) { return m_registers[i]; }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr register_type const &vec(int i) const { return m_registers[i]; }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  register_type &get_register(int reg) { return m_registers[reg]; }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr register_type const &get_register(int reg) const
+  {
+    return m_registers[reg];
+  }
+
+
+  /*!
+   * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
+   *
+   * Derived types can override this to implement intrinsic FMS's
+   *
+   * @param b Second product operand
+   * @param c Subtraction operand
+   * @return Value of (*this)*b-c
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const &b, self_type const &c) const
+  {
+    return getThis()->multiply_add(b, -c);
+  }
+
+  /*!
+   * Multiply this tensor by a scalar value
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type scale(element_type c) const
+  {
+    return getThis()->multiply(self_type(c));
+  }
+
+
+  /*!
+   * In-place add operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type &inplace_add(self_type x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
+
+  /*!
+   * In-place subtract operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type &inplace_subtract(self_type x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
 
+  /*!
+   * In-place multiply operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type &inplace_multiply(self_type x)
+  {
+    *getThis() = getThis()->multiply(x);
+    return *getThis();
+  }
+
+  /*!
+   * In-place multiply-add operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type &inplace_multiply_add(self_type x, self_type y)
+  {
+    *getThis() = getThis()->multiply_add(x, y);
+    return *getThis();
+  }
+
+  /*!
+   * In-place multiply-subtract operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type &inplace_multiply_subtract(self_type x, self_type y)
+  {
+    *getThis() = getThis()->multiply_subtract(x, y);
+    return *getThis();
+  }
+
+  /*!
+   * In-place divide operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type &inplace_divide(self_type x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
+
+  /*!
+   * In-place scaling operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type &inplace_scale(element_type x)
+  {
+    *getThis() = getThis()->scale(x);
+    return *getThis();
+  }
+};
+
+}  // namespace expt
+
+}  // namespace internal
+
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp b/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp
index 3899a97118..eef18e9298 100644
--- a/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp
@@ -19,11 +19,9 @@
 #define RAJA_pattern_tensor_TensorTileExec_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/pattern/tensor/internal/TensorRef.hpp"
 #include "RAJA/pattern/tensor/stats.hpp"
+#include "RAJA/util/macros.hpp"
 
 namespace RAJA
 {
@@ -33,345 +31,347 @@ namespace expt
 {
 
 
+template <typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
+struct StaticTensorTileExec;
 
-    template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
-    struct StaticTensorTileExec;
-
-    template<typename STORAGE, typename DIM_SEQ>
-    struct TensorTileExec;
-
-    /**
-     * Implement a dimension tiling loop
-     */
-    template<typename STORAGE, camp::idx_t DIM0, camp::idx_t ... DIM_REST>
-    struct TensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>>{
-
-      using inner_t = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
-
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void exec(OTILE const &otile, TTYPE &tile, BODY && body){
-
-        auto const orig_begin = otile.m_begin[DIM0];
-        auto const orig_size =  otile.m_size[DIM0];
-
-        // Do the full tile sizes
-        for(tile.m_begin[DIM0] = orig_begin;
-
-            tile.m_begin[DIM0] +  STORAGE::s_dim_elem(DIM0) <=
-                orig_begin+orig_size;
-
-            tile.m_begin[DIM0] += STORAGE::s_dim_elem(DIM0)){
-
-          // Do the next inner tiling loop
-          inner_t::exec(otile, tile, body);
-
-        }
-
-        // Postamble if needed
-        if(tile.m_begin[DIM0] <
-            orig_begin + orig_size)
-        {
-
-          // convert tile to a partial tile
-          auto &part_tile = make_tensor_tile_partial(tile);
-
-          // store original size
-          auto tmp_size = part_tile.m_size[DIM0];
-
-          // set tile size to the remainder
-          part_tile.m_size[DIM0] =
-              orig_begin +
-              orig_size -
-              tile.m_begin[DIM0];
-
-          // Do the next inner tiling loop
-          inner_t::exec(otile, part_tile, body);
-
-          // restore size
-          part_tile.m_size[DIM0] = tmp_size;
-        }
-
-        // reset tile dimension
-        tile.m_begin[DIM0] = orig_begin;
-
-      }
-
-
-
-      template<
-          typename OTILE,
-          typename TTYPE,
-          typename BODY
-      >
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void
-      static_exec(
-          OTILE const &otile,
-          TTYPE const &tile,
-          BODY && body
-      ){
-
-
-        auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
-        auto constexpr orig_size =  OTILE:: size_type::value_at(DIM0);
-
-        auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
-
-        auto constexpr step_size  = STORAGE::s_dim_elem(DIM0);
-
-        auto constexpr iter_count =
-               (tile_begin >= orig_begin) && (tile_begin < (orig_begin+orig_size))
-                 ? ((orig_begin + orig_size) - tile_begin + step_size - 1) / step_size
-                 : 0;
+template <typename STORAGE, typename DIM_SEQ>
+struct TensorTileExec;
 
+/**
+ * Implement a dimension tiling loop
+ */
+template <typename STORAGE, camp::idx_t DIM0, camp::idx_t... DIM_REST>
+struct TensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>> {
 
-        using IterCount = camp::integral_constant<typename TTYPE::index_type,iter_count>;
-        using DimSeq = camp::idx_seq<DIM0,DIM_REST...>;
-        using IdxSeq = typename camp::detail::gen_seq<typename TTYPE::index_type,IterCount>::type;
-
-        StaticTensorTileExec<STORAGE,DimSeq,IdxSeq>::exec(otile,tile,body);
-        
-      }
-
-
-
-    };
-
-
-    /**
-     * Termination of nested loop:  execute evaluation of ET
-     */
-    template<typename STORAGE>
-    struct TensorTileExec<STORAGE, camp::idx_seq<>>{
-
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void exec(OTILE &, TTYPE const &tile, BODY && body){
-
-        // execute body, passing in the current tile
-        body(tile);
-
-      }
-
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void static_exec(OTILE const &, TTYPE const &tile, BODY && body){
-
-        // execute body, passing in the current tile
-        body(tile);
-
-      }
-
-    };
-
-
-
-    template<typename STORAGE, typename TILE_TYPE, typename BODY, camp::idx_t ... IDX_SEQ, camp::idx_t ... DIM_SEQ>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    void tensorTileExec_expanded(TILE_TYPE const &orig_tile, BODY && body, camp::idx_seq<IDX_SEQ...> const &, camp::idx_seq<DIM_SEQ...> const &)
-    {
-
-      // tile over full rows and columns
-      // tile_type tile{{0,0},{row_tile_size, col_tile_size}};
-      TILE_TYPE tile {
-        {orig_tile.m_begin[IDX_SEQ]...},
-        {STORAGE::s_dim_elem(IDX_SEQ)...},
-      };
+  using inner_t = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
 
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const &otile,
+                                                TTYPE &tile,
+                                                BODY &&body)
+  {
 
-      // Promote the tile type to a "full-tile" so that the full-element
-      // register operations are used.
-      // Any of the tiling loops can demote this to a partial-tile when
-      // they do postamble execution
-      auto &full_tile = make_tensor_tile_full(tile);
+    auto const orig_begin = otile.m_begin[DIM0];
+    auto const orig_size = otile.m_size[DIM0];
 
-      // Do all of the tiling loops in layout order, this may improve
-      // cache performance
-      using layout_order = typename STORAGE::layout_type::seq_t;
-      using tensor_tile_exec_t =
-             TensorTileExec<STORAGE, layout_order>;
+    // Do the full tile sizes
+    for (tile.m_begin[DIM0] = orig_begin;
 
+         tile.m_begin[DIM0] + STORAGE::s_dim_elem(DIM0) <=
+         orig_begin + orig_size;
 
-      tensor_tile_exec_t::exec(orig_tile, full_tile, body);
+         tile.m_begin[DIM0] += STORAGE::s_dim_elem(DIM0)) {
 
+      // Do the next inner tiling loop
+      inner_t::exec(otile, tile, body);
     }
 
+    // Postamble if needed
+    if (tile.m_begin[DIM0] < orig_begin + orig_size) {
 
-    template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
-    struct StaticTensorTileExec;
-
-    /**
-     * Implement a dimension tiling loop
-     */
-
-    template<typename STORAGE, camp::idx_t DIM0, camp::idx_t ... DIM_REST, camp::idx_t IDX, camp::idx_t ... IDX_REST>
-    struct StaticTensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>,camp::idx_seq<IDX,IDX_REST...>>{
-
-          using DimList  = camp::idx_seq<DIM0, DIM_REST...>;
-          using DimTail  = camp::idx_seq<      DIM_REST...>;
-          using IdxList  = camp::idx_seq<IDX , IDX_REST...>;
-          using IdxTail  = camp::idx_seq<      IDX_REST...>;
-
-          using DownExec = TensorTileExec<STORAGE,camp::idx_seq<DIM_REST...>>;
-          using NextExec = StaticTensorTileExec<STORAGE,camp::idx_seq<DIM0,DIM_REST...>,camp::idx_seq<IDX_REST...>>;
+      // convert tile to a partial tile
+      auto &part_tile = make_tensor_tile_partial(tile);
 
-          static auto const step_size = STORAGE::s_dim_elem(DIM0);
+      // store original size
+      auto tmp_size = part_tile.m_size[DIM0];
 
-          template<
-              typename OTILE,
-              typename TTYPE,
-              typename BODY
-          >
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static
-          void
-          exec(
-              OTILE const &otile,
-              TTYPE const &tile,
-              BODY && body
-          ){
-    
-            auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
-            auto constexpr orig_size =  OTILE:: size_type::value_at(DIM0);
-    
-            auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
+      // set tile size to the remainder
+      part_tile.m_size[DIM0] = orig_begin + orig_size - tile.m_begin[DIM0];
 
-            using NextBegin = camp::integral_constant<typename TTYPE::index_type,tile_begin+STORAGE::s_dim_elem(DIM0)>;
-            using TailSize  = camp::integral_constant<typename TTYPE::index_type,(orig_begin+orig_size)-tile_begin>;
+      // Do the next inner tiling loop
+      inner_t::exec(otile, part_tile, body);
 
-            using NextTile  = typename expt::SetStaticTensorTileBegin<TTYPE,NextBegin,(size_t)DIM0>::Type;
+      // restore size
+      part_tile.m_size[DIM0] = tmp_size;
+    }
 
-            using TailTile  = typename expt::SetStaticTensorTileSize <TTYPE,TailSize ,(size_t)DIM0>::Type;
-            using PartTile  = typename TailTile::Partial;
+    // reset tile dimension
+    tile.m_begin[DIM0] = orig_begin;
+  }
 
-    
-            static_assert( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size+ STORAGE::s_dim_elem(DIM0) ), "OOB StaticTensorTileExec DOWN" );
-     
-            if( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size) ){
-               DownExec::static_exec(otile, tile, body);
-               NextTile next_tile;
-               NextExec::exec(otile, next_tile, body);
-            } else if ( tile_begin < (orig_begin + orig_size ) ) {
-               PartTile part_tile;
-               DownExec::static_exec(otile,part_tile,body);
-            }
-    
-          }
 
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void static_exec(OTILE const &otile,
+                                                       TTYPE const &tile,
+                                                       BODY &&body)
+  {
 
 
-    };
+    auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
+    auto constexpr orig_size = OTILE::size_type::value_at(DIM0);
 
+    auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
 
+    auto constexpr step_size = STORAGE::s_dim_elem(DIM0);
 
-    template<typename STORAGE, camp::idx_t DIM0, camp::idx_t IDX, camp::idx_t ... IDX_REST>
-    struct StaticTensorTileExec<STORAGE, camp::idx_seq<DIM0>, camp::idx_seq<IDX,IDX_REST...>>{
-      using NextExec = StaticTensorTileExec<STORAGE,camp::idx_seq<DIM0>,camp::idx_seq<IDX_REST...>>;
+    auto constexpr iter_count =
+        (tile_begin >= orig_begin) && (tile_begin < (orig_begin + orig_size))
+            ? ((orig_begin + orig_size) - tile_begin + step_size - 1) /
+                  step_size
+            : 0;
 
 
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static void exec(OTILE const & otile, TTYPE const &tile, BODY && body) {
-            auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
-            auto constexpr orig_size =  OTILE:: size_type::value_at(DIM0);
-    
-            auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
+    using IterCount =
+        camp::integral_constant<typename TTYPE::index_type, iter_count>;
+    using DimSeq = camp::idx_seq<DIM0, DIM_REST...>;
+    using IdxSeq = typename camp::detail::gen_seq<typename TTYPE::index_type,
+                                                  IterCount>::type;
 
-            using NextBegin = camp::integral_constant<typename TTYPE::index_type,tile_begin+STORAGE::s_dim_elem(DIM0)>;
-            using TailSize  = camp::integral_constant<typename TTYPE::index_type,(orig_begin+orig_size)-tile_begin>;
+    StaticTensorTileExec<STORAGE, DimSeq, IdxSeq>::exec(otile, tile, body);
+  }
+};
 
-            using NextTile  = typename expt::SetStaticTensorTileBegin<TTYPE,NextBegin,(size_t)DIM0>::Type;
 
-            using TailTile  = typename expt::SetStaticTensorTileSize <TTYPE,TailSize ,(size_t)DIM0>::Type;
-            using PartTile  = typename TailTile::Partial;
+/**
+ * Termination of nested loop:  execute evaluation of ET
+ */
+template <typename STORAGE>
+struct TensorTileExec<STORAGE, camp::idx_seq<>> {
+
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE &,
+                                                TTYPE const &tile,
+                                                BODY &&body)
+  {
+
+    // execute body, passing in the current tile
+    body(tile);
+  }
+
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void static_exec(OTILE const &,
+                                                       TTYPE const &tile,
+                                                       BODY &&body)
+  {
+
+    // execute body, passing in the current tile
+    body(tile);
+  }
+};
+
+
+template <typename STORAGE,
+          typename TILE_TYPE,
+          typename BODY,
+          camp::idx_t... IDX_SEQ,
+          camp::idx_t... DIM_SEQ>
+RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec_expanded(
+    TILE_TYPE const &orig_tile,
+    BODY &&body,
+    camp::idx_seq<IDX_SEQ...> const &,
+    camp::idx_seq<DIM_SEQ...> const &)
+{
 
-    
-            static_assert( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size+ STORAGE::s_dim_elem(DIM0) ), "OOB StaticTensorTileExec ACROSS" );
-     
-            if( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size) ){
-               body(tile);
-               NextTile next_tile;
-               NextExec::exec(otile, next_tile, body);
-            } else if ( tile_begin < (orig_begin + orig_size ) ) {
-               PartTile part_tile;
-               body(part_tile);
-            }
-      }
+  // tile over full rows and columns
+  // tile_type tile{{0,0},{row_tile_size, col_tile_size}};
+  TILE_TYPE tile{
+      {orig_tile.m_begin[IDX_SEQ]...},
+      {STORAGE::s_dim_elem(IDX_SEQ)...},
+  };
 
-    };
 
-    template<typename STORAGE, camp::idx_t ... DIM_REST>
-    struct StaticTensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>, camp::idx_seq<> >{
+  // Promote the tile type to a "full-tile" so that the full-element
+  // register operations are used.
+  // Any of the tiling loops can demote this to a partial-tile when
+  // they do postamble execution
+  auto &full_tile = make_tensor_tile_full(tile);
 
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static void exec(OTILE const &, TTYPE const &, BODY &&) {}
+  // Do all of the tiling loops in layout order, this may improve
+  // cache performance
+  using layout_order = typename STORAGE::layout_type::seq_t;
+  using tensor_tile_exec_t = TensorTileExec<STORAGE, layout_order>;
 
-    };
 
+  tensor_tile_exec_t::exec(orig_tile, full_tile, body);
+}
 
 
-    template<typename STORAGE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE, typename BODY, camp::idx_t ... IDX_SEQ, camp::idx_t ... DIM_SEQ>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    void tensorTileExec_expanded( StaticTensorTile<INDEX_TYPE,TENSOR_SIZE, TBEGIN, TSIZE> const &orig_tile, BODY && body, camp::idx_seq<IDX_SEQ...> const &, camp::idx_seq<DIM_SEQ...> const &)
-    {
-
-      using InputType = StaticTensorTile<
-          INDEX_TYPE,
-          TENSOR_SIZE,
-          TBEGIN,
-          TSIZE
-      >;
+template <typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
+struct StaticTensorTileExec;
 
-      using InputBegin = typename InputType::begin_type;
+/**
+ * Implement a dimension tiling loop
+ */
 
-      using Type = StaticTensorTile<
-          INDEX_TYPE,
-          TENSOR_FULL,
-          camp::int_seq<INDEX_TYPE,InputBegin::value_at(IDX_SEQ)...>,
-          camp::int_seq<INDEX_TYPE,STORAGE::s_dim_elem(IDX_SEQ)...>
-      >;
+template <typename STORAGE,
+          camp::idx_t DIM0,
+          camp::idx_t... DIM_REST,
+          camp::idx_t IDX,
+          camp::idx_t... IDX_REST>
+struct StaticTensorTileExec<STORAGE,
+                            camp::idx_seq<DIM0, DIM_REST...>,
+                            camp::idx_seq<IDX, IDX_REST...>> {
+
+  using DimList = camp::idx_seq<DIM0, DIM_REST...>;
+  using DimTail = camp::idx_seq<DIM_REST...>;
+  using IdxList = camp::idx_seq<IDX, IDX_REST...>;
+  using IdxTail = camp::idx_seq<IDX_REST...>;
+
+  using DownExec = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
+  using NextExec = StaticTensorTileExec<STORAGE,
+                                        camp::idx_seq<DIM0, DIM_REST...>,
+                                        camp::idx_seq<IDX_REST...>>;
+
+  static auto const step_size = STORAGE::s_dim_elem(DIM0);
+
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const &otile,
+                                                TTYPE const &tile,
+                                                BODY &&body)
+  {
+
+    auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
+    auto constexpr orig_size = OTILE::size_type::value_at(DIM0);
+
+    auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
+
+    using NextBegin =
+        camp::integral_constant<typename TTYPE::index_type,
+                                tile_begin + STORAGE::s_dim_elem(DIM0)>;
+    using TailSize =
+        camp::integral_constant<typename TTYPE::index_type,
+                                (orig_begin + orig_size) - tile_begin>;
+
+    using NextTile = typename expt::
+        SetStaticTensorTileBegin<TTYPE, NextBegin, (size_t)DIM0>::Type;
+
+    using TailTile = typename expt::
+        SetStaticTensorTileSize<TTYPE, TailSize, (size_t)DIM0>::Type;
+    using PartTile = typename TailTile::Partial;
+
+
+    static_assert((tile_begin + STORAGE::s_dim_elem(DIM0)) <=
+                      (orig_begin + orig_size + STORAGE::s_dim_elem(DIM0)),
+                  "OOB StaticTensorTileExec DOWN");
+
+    if ((tile_begin + STORAGE::s_dim_elem(DIM0)) <= (orig_begin + orig_size)) {
+      DownExec::static_exec(otile, tile, body);
+      NextTile next_tile;
+      NextExec::exec(otile, next_tile, body);
+    } else if (tile_begin < (orig_begin + orig_size)) {
+      PartTile part_tile;
+      DownExec::static_exec(otile, part_tile, body);
+    }
+  }
+};
+
+
+template <typename STORAGE,
+          camp::idx_t DIM0,
+          camp::idx_t IDX,
+          camp::idx_t... IDX_REST>
+struct StaticTensorTileExec<STORAGE,
+                            camp::idx_seq<DIM0>,
+                            camp::idx_seq<IDX, IDX_REST...>> {
+  using NextExec = StaticTensorTileExec<STORAGE,
+                                        camp::idx_seq<DIM0>,
+                                        camp::idx_seq<IDX_REST...>>;
+
+
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const &otile,
+                                                TTYPE const &tile,
+                                                BODY &&body)
+  {
+    auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
+    auto constexpr orig_size = OTILE::size_type::value_at(DIM0);
+
+    auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
+
+    using NextBegin =
+        camp::integral_constant<typename TTYPE::index_type,
+                                tile_begin + STORAGE::s_dim_elem(DIM0)>;
+    using TailSize =
+        camp::integral_constant<typename TTYPE::index_type,
+                                (orig_begin + orig_size) - tile_begin>;
+
+    using NextTile = typename expt::
+        SetStaticTensorTileBegin<TTYPE, NextBegin, (size_t)DIM0>::Type;
+
+    using TailTile = typename expt::
+        SetStaticTensorTileSize<TTYPE, TailSize, (size_t)DIM0>::Type;
+    using PartTile = typename TailTile::Partial;
+
+
+    static_assert((tile_begin + STORAGE::s_dim_elem(DIM0)) <=
+                      (orig_begin + orig_size + STORAGE::s_dim_elem(DIM0)),
+                  "OOB StaticTensorTileExec ACROSS");
+
+    if ((tile_begin + STORAGE::s_dim_elem(DIM0)) <= (orig_begin + orig_size)) {
+      body(tile);
+      NextTile next_tile;
+      NextExec::exec(otile, next_tile, body);
+    } else if (tile_begin < (orig_begin + orig_size)) {
+      PartTile part_tile;
+      body(part_tile);
+    }
+  }
+};
+
+template <typename STORAGE, camp::idx_t... DIM_REST>
+struct StaticTensorTileExec<STORAGE,
+                            camp::idx_seq<DIM_REST...>,
+                            camp::idx_seq<>> {
+
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const &,
+                                                TTYPE const &,
+                                                BODY &&)
+  {
+  }
+};
+
+
+template <typename STORAGE,
+          typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE,
+          typename BODY,
+          camp::idx_t... IDX_SEQ,
+          camp::idx_t... DIM_SEQ>
+RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec_expanded(
+    StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE> const &orig_tile,
+    BODY &&body,
+    camp::idx_seq<IDX_SEQ...> const &,
+    camp::idx_seq<DIM_SEQ...> const &)
+{
 
-      Type full_tile;
+  using InputType = StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE>;
 
-      // Do all of the tiling loops in layout order, this may improve
-      // cache performance
-      using layout_order = typename STORAGE::layout_type::seq_t;
-      using tensor_tile_exec_t =
-             TensorTileExec<STORAGE, layout_order>;
+  using InputBegin = typename InputType::begin_type;
 
+  using Type = StaticTensorTile<
+      INDEX_TYPE,
+      TENSOR_FULL,
+      camp::int_seq<INDEX_TYPE, InputBegin::value_at(IDX_SEQ)...>,
+      camp::int_seq<INDEX_TYPE, STORAGE::s_dim_elem(IDX_SEQ)...>>;
 
-      tensor_tile_exec_t::static_exec(orig_tile, full_tile, body);
+  Type full_tile;
 
-    }
+  // Do all of the tiling loops in layout order, this may improve
+  // cache performance
+  using layout_order = typename STORAGE::layout_type::seq_t;
+  using tensor_tile_exec_t = TensorTileExec<STORAGE, layout_order>;
 
 
+  tensor_tile_exec_t::static_exec(orig_tile, full_tile, body);
+}
 
-    template<typename STORAGE, typename TILE_TYPE, typename BODY>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    void tensorTileExec(TILE_TYPE const &tile, BODY && body)
-    {
-      using layout_type = typename STORAGE::layout_type;
-      tensorTileExec_expanded<STORAGE>(tile, body, camp::make_idx_seq_t<STORAGE::s_num_dims>{}, layout_type{});
-    }
 
-  } // namespace internal
-} // namespace expt
+template <typename STORAGE, typename TILE_TYPE, typename BODY>
+RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec(TILE_TYPE const &tile,
+                                                 BODY &&body)
+{
+  using layout_type = typename STORAGE::layout_type;
+  tensorTileExec_expanded<STORAGE>(tile,
+                                   body,
+                                   camp::make_idx_seq_t<STORAGE::s_num_dims>{},
+                                   layout_type{});
+}
+
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp b/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp
index 4ef4998fbe..d9d6edfcb0 100644
--- a/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp
+++ b/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp
@@ -19,13 +19,11 @@
 #define RAJA_pattern_tensor_VectorRegisterImpl_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/macros.hpp"
-
-#include "camp/camp.hpp"
 #include "RAJA/pattern/tensor/internal/TensorRegisterBase.hpp"
 #include "RAJA/pattern/tensor/stats.hpp"
 #include "RAJA/util/BitMask.hpp"
+#include "RAJA/util/macros.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -34,958 +32,993 @@ namespace RAJA
 namespace expt
 {
 
-  /*!
-   * This provides a Tensor specialization for vectors
-   */
-  template<typename REGISTER_POLICY, typename T, camp::idx_t SIZE>
-  class TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>> :
-    public internal::expt::TensorRegisterBase<RAJA::expt::TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>>
+/*!
+ * This provides a Tensor specialization for vectors
+ */
+template <typename REGISTER_POLICY, typename T, camp::idx_t SIZE>
+class TensorRegister<REGISTER_POLICY,
+                     T,
+                     RAJA::expt::VectorLayout,
+                     camp::idx_seq<SIZE>>
+    : public internal::expt::TensorRegisterBase<
+          RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                     T,
+                                     RAJA::expt::VectorLayout,
+                                     camp::idx_seq<SIZE>>>
+{
+public:
+  using self_type = TensorRegister<REGISTER_POLICY,
+                                   T,
+                                   RAJA::expt::VectorLayout,
+                                   camp::idx_seq<SIZE>>;
+  using base_type = internal::expt::TensorRegisterBase<
+      RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                 T,
+                                 RAJA::expt::VectorLayout,
+                                 camp::idx_seq<SIZE>>>;
+  using element_type = camp::decay<T>;
+  using layout_type = TensorLayout<0>;
+  using register_type = Register<T, REGISTER_POLICY>;
+
+  static constexpr camp::idx_t s_num_elem = SIZE;
+
+  using int_element_type =
+      typename register_type::int_vector_type::element_type;
+  using int_vector_type = TensorRegister<REGISTER_POLICY,
+                                         int_element_type,
+                                         RAJA::expt::VectorLayout,
+                                         camp::idx_seq<SIZE>>;
+
+private:
+  static constexpr camp::idx_t s_register_num_elem = register_type::s_num_elem;
+
+  static constexpr camp::idx_t s_num_full_registers =
+      s_num_elem / s_register_num_elem;
+
+  static constexpr camp::idx_t s_num_partial_lanes =
+      s_num_elem % s_register_num_elem;
+
+  static constexpr camp::idx_t s_num_registers = (s_num_partial_lanes > 0)
+                                                     ? s_num_full_registers + 1
+                                                     : s_num_full_registers;
+
+  using log_base2_t = RAJA::LogBase2<s_register_num_elem>;
+
+  static constexpr camp::idx_t s_shift_per_register = log_base2_t::value;
+
+  static constexpr camp::idx_t s_mask_per_register =
+      (1 << log_base2_t::value) - 1;
+
+  // Offset of last register in m_registers
+  static constexpr camp::idx_t s_final_register = s_num_partial_lanes == 0
+                                                      ? s_num_full_registers - 1
+                                                      : s_num_full_registers;
+
+  template <typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_register(IDX i) -> IDX
   {
-    public:
-      using self_type = TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>;
-      using base_type = internal::expt::TensorRegisterBase<RAJA::expt::TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>>;
-      using element_type = camp::decay<T>;
-      using layout_type = TensorLayout<0>;
-      using register_type = Register<T, REGISTER_POLICY>;
-
-      static constexpr camp::idx_t s_num_elem = SIZE;
-
-      using int_element_type = typename register_type::int_vector_type::element_type;
-      using int_vector_type = TensorRegister<REGISTER_POLICY, int_element_type, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>;
-
-    private:
+    return i >> IDX(s_shift_per_register);
+  }
 
-      static constexpr camp::idx_t s_register_num_elem = register_type::s_num_elem;
-
-      static constexpr camp::idx_t s_num_full_registers = s_num_elem/s_register_num_elem;
-
-      static constexpr camp::idx_t s_num_partial_lanes =  s_num_elem%s_register_num_elem;
-
-      static constexpr camp::idx_t s_num_registers =
-          (s_num_partial_lanes > 0) ?
-              s_num_full_registers + 1 :
-              s_num_full_registers;
-
-      using log_base2_t = RAJA::LogBase2<s_register_num_elem>;
+  template <typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_lane(IDX i) -> IDX
+  {
+    return i & IDX(s_mask_per_register);
+  }
 
-      static constexpr camp::idx_t s_shift_per_register =
-          log_base2_t::value;
 
-      static constexpr camp::idx_t s_mask_per_register =
-          (1<<log_base2_t::value)-1;
+  using base_type::m_registers;
 
-      // Offset of last regiser in m_registers
-      static constexpr camp::idx_t s_final_register =
-          s_num_partial_lanes == 0 ?
-              s_num_full_registers-1 : s_num_full_registers;
+public:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr TensorRegister() {}
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_register(IDX i) -> IDX {
-        return i >> IDX(s_shift_per_register);
-      }
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_lane(IDX i) -> IDX {
-        return i & IDX(s_mask_per_register);
-      }
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  TensorRegister(element_type c) { this->broadcast(c); }
 
 
-      using base_type::m_registers;
-
-    public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorRegister(self_type const &c) : base_type(c) {}
 
+  /*
+   * Overload for:    assignment of ET to a RAJA::expt::TensorRegister
+   */
+  template <typename RHS,
+            typename std::enable_if<
+                std::is_base_of<
+                    RAJA::internal::expt::ET::TensorExpressionConcreteBase,
+                    RHS>::value,
+                bool>::type = true>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorRegister(RHS const &rhs)
+  {
+    // evaluate a single tile of the ET, storing in this
+    // RAJA::expt::TensorRegister
+    *this = rhs.eval(base_type::s_get_default_tile());
+  }
 
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      TensorRegister(){}
+  template <typename... REGS>
+  explicit RAJA_HOST_DEVICE RAJA_INLINE TensorRegister(register_type reg0,
+                                                       REGS const &...regs)
+      : base_type(reg0, regs...)
+  {
+  }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr bool is_root() { return register_type::is_root(); }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegister(element_type c)
-      {
-        this->broadcast(c);
-      }
 
+  /*!
+   * Returns true if the underlying data is packed for a given tensor ref
+   *
+   * This is true if either:
+   *   It's column major and the rows are stride one
+   *   It's row major and the columns are stride one
+   */
+  template <camp::idx_t STRIDE_ONE_DIM>
+  RAJA_HOST_DEVICE RAJA_INLINE static constexpr bool is_ref_packed()
+  {
+    return STRIDE_ONE_DIM == 0;
+  }
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegister(self_type const &c) :
-        base_type(c)
-      {
-      }
 
-      /*
-       * Overload for:    assignment of ET to a RAJA::expt::TensorRegister
-       */
-      template<typename RHS,
-        typename std::enable_if<std::is_base_of<RAJA::internal::expt::ET::TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegister(RHS const &rhs)
-      {
-        // evaluate a single tile of the ET, storing in this RAJA::expt::TensorRegister
-        *this = rhs.eval(base_type::s_get_default_tile());
-      }
+  /*!
+   * Gets the maximum size of the vector along the specified dimension
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr camp::idx_t s_dim_elem(camp::idx_t dim)
+  {
+    return dim == 0 ? s_num_elem : 0;
+  }
 
 
-      template<typename ... REGS>
-      explicit
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegister(register_type reg0, REGS const &... regs) :
-        base_type(reg0, regs...)
-      {
-      }
+  /*!
+   * @brief Set entire vector to a single scalar value
+   * @param value Value to set all vector elements to
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator=(element_type value)
+  {
+    this->broadcast(value);
+    return *this;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return register_type::is_root();
-      }
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator=(self_type const &c) { return this->copy(c); }
 
+  /*!
+   * Provide left vector-matrix multiply for operator* between
+   * this vector and a matrix
+   */
+  template <typename T2, typename L, typename RP>
+  self_type operator*(SquareMatrixRegister<T2, L, RP> const &y) const
+  {
+    return y.left_vector_multiply(*this);
+  }
 
-      /*!
-       * Returns true if the underlying data packed for a given tensor ref
-       *
-       * This is true if either:
-       *   It's column major and the rows are stride one
-       *   It's row major and the columns are stride one
-       */
-      template<camp::idx_t STRIDE_ONE_DIM>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_ref_packed() {
-        return STRIDE_ONE_DIM == 0;
-      }
 
+  template <typename REF_TYPE>
+  struct RefBridge;
 
-      /*!
-       * Gets the maximum size of matrix along specified dimension
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr camp::idx_t s_dim_elem(camp::idx_t dim){
-        return dim == 0 ? s_num_elem : 0;
-      }
 
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type &load_ref(REF_TYPE const &ref)
+  {
+    RefBridge<REF_TYPE>::load_ref(*this, ref);
+    return *this;
+  }
 
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(element_type value)
-      {
-        this->broadcast(value);
-        return *this;
-      }
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const &store_ref(REF_TYPE &ref) const
+  {
+    RefBridge<REF_TYPE>::store_ref(*this, ref);
+    return *this;
+  }
+
+
+  template <typename POINTER_TYPE,
+            typename INDEX_TYPE,
+            RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+            camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<
+      RAJA::internal::expt::
+          TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>> {
+
+    using RefType = RAJA::internal::expt::
+        TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>;
+
+    /*!
+     * @brief Performs load specified by TensorRef object.
+     */
+    RAJA_HOST_DEVICE
+    RAJA_INLINE
+    static void load_ref(self_type &self, RefType const &ref)
+    {
+
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
+
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0) {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed++;
+#endif
+          self.load_packed(ptr);
+        }
+        // partial
+        else {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed_n++;
+#endif
+          self.load_packed_n(ptr, ref.m_tile.m_size[0]);
+        }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        return this->copy(c);
       }
-
-      /*!
-       * Provide left vector-matrix multiply for operator* between
-       * this vector and a matrix
-       */
-      template<typename T2, typename L, typename RP>
-      self_type
-      operator*(SquareMatrixRegister<T2, L, RP> const &y) const
-      {
-        return y.left_vector_multiply(*this);
+      // strided data
+      else {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided++;
+#endif
+          self.load_strided(ptr, ref.m_stride[0]);
+        }
+        // partial
+        else {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided_n++;
+#endif
+          self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
+        }
       }
+    }
 
 
-      template<typename REF_TYPE>
-      struct RefBridge;
-
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type& load_ref (REF_TYPE const &ref){
-          RefBridge<REF_TYPE>::load_ref(*this,ref);
-          return *this;
-      }
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_ref (REF_TYPE &ref) const {
-          RefBridge<REF_TYPE>::store_ref(*this,ref);
-          return *this;
-      }
+    /*!
+     * @brief Performs store specified by TensorRef object.
+     */
+    RAJA_HOST_DEVICE
+    RAJA_INLINE
+    static void store_ref(self_type const &self, RefType &ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
 
-      
-      template<typename POINTER_TYPE, typename INDEX_TYPE, RAJA::internal::expt::TensorTileSize TENSOR_SIZE, camp::idx_t STRIDE_ONE_DIM>
-      struct RefBridge <RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>>
-      {
-
-          using RefType = RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void load_ref (self_type& self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed ++;
-              #endif
-                self.load_packed(ptr);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed_n ++;
-              #endif
-                self.load_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided ++;
-              #endif
-                self.load_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided_n ++;
-              #endif
-                self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-
-
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed ++;
-    #endif
-                self.store_packed(ptr);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed_n ++;
-    #endif
-                self.store_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided ++;
-    #endif
-                self.store_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided_n ++;
-    #endif
-                self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-           
-
-      };
-
-
-
-
-
-      
-      template<typename POINTER_TYPE, typename INDEX_TYPE, RAJA::internal::expt::TensorTileSize TENSOR_SIZE, INDEX_TYPE STRIDE_VALUE, INDEX_TYPE BEGIN_VALUE, INDEX_TYPE SIZE_VALUE, camp::idx_t STRIDE_ONE_DIM>
-      struct RefBridge <RAJA::internal::expt::StaticTensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, camp::int_seq<INDEX_TYPE,STRIDE_VALUE>, camp::int_seq<INDEX_TYPE,BEGIN_VALUE>, camp::int_seq<INDEX_TYPE,SIZE_VALUE>, STRIDE_ONE_DIM>> 
-      {
-
-          using RefType = RAJA::internal::expt::StaticTensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, camp::int_seq<INDEX_TYPE,STRIDE_VALUE>, camp::int_seq<INDEX_TYPE,BEGIN_VALUE>, camp::int_seq<INDEX_TYPE,SIZE_VALUE>, STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by StaticTensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void load_ref (self_type &self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed ++;
-              #endif
-                self.load_packed(ptr);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed_n ++;
-              #endif
-                self.load_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided ++;
-              #endif
-                self.load_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided_n ++;
-              #endif
-                self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-
-
-
-          /*!
-           * @brief Performs load specified by StaticTensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed ++;
-    #endif
-                self.store_packed(ptr);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed_n ++;
-    #endif
-                self.store_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided ++;
-    #endif
-                self.store_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided_n ++;
-    #endif
-                self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-           
-
-      };
-     
-
-
-
-      /*!
-       * Loads a dense full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr)
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].load_packed(ptr+reg*s_register_num_elem);
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0) {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed++;
+#endif
+          self.store_packed(ptr);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_packed_n(ptr+s_final_register*s_register_num_elem, s_num_partial_lanes);
+        // partial
+        else {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed_n++;
+#endif
+          self.store_packed_n(ptr, ref.m_tile.m_size[0]);
         }
-        return *this;
-      }
 
-      /*!
-       * Loads a strided full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, int stride)
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].load_strided(ptr+reg*s_register_num_elem*stride, stride);
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_strided_n(ptr+s_final_register*s_register_num_elem*stride, stride, s_num_partial_lanes);
-        }
-        return *this;
       }
-
-      /*!
-       * Loads a dense partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, int N)
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].load_packed(ptr+reg*s_register_num_elem);
-          }
-          else{
-            m_registers[reg].load_packed_n(ptr+reg*s_register_num_elem,
-                                           N-reg*s_register_num_elem);
-
-            for(camp::idx_t r = reg+1;r < s_num_full_registers;++ r){
-              m_registers[r].broadcast(0);
-            }
-            return *this;
-          }
-
+      // strided data
+      else {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided++;
+#endif
+          self.store_strided(ptr, ref.m_stride[0]);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_packed_n(
-              ptr+s_final_register*s_register_num_elem,
-              N-s_final_register*s_register_num_elem);
+        // partial
+        else {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided_n++;
+#endif
+          self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
         }
-        return *this;
       }
+    }
+  };
 
-      /*!
-       * Loads a strided partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr,
-          int stride, int N)
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].load_strided(ptr+reg*s_register_num_elem*stride, stride);
-          }
-          else{
-            m_registers[reg].load_strided_n(ptr+reg*s_register_num_elem*stride,
-                                            stride,
-                                            N-reg*s_register_num_elem);
-            for(camp::idx_t r = reg+1;r < s_num_full_registers;++ r){
-              m_registers[r].broadcast(0);
-            }
-            return *this;
-          }
 
+  template <typename POINTER_TYPE,
+            typename INDEX_TYPE,
+            RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE STRIDE_VALUE,
+            INDEX_TYPE BEGIN_VALUE,
+            INDEX_TYPE SIZE_VALUE,
+            camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<RAJA::internal::expt::StaticTensorRef<
+      POINTER_TYPE,
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      camp::int_seq<INDEX_TYPE, STRIDE_VALUE>,
+      camp::int_seq<INDEX_TYPE, BEGIN_VALUE>,
+      camp::int_seq<INDEX_TYPE, SIZE_VALUE>,
+      STRIDE_ONE_DIM>> {
+
+    using RefType = RAJA::internal::expt::StaticTensorRef<
+        POINTER_TYPE,
+        INDEX_TYPE,
+        TENSOR_SIZE,
+        camp::int_seq<INDEX_TYPE, STRIDE_VALUE>,
+        camp::int_seq<INDEX_TYPE, BEGIN_VALUE>,
+        camp::int_seq<INDEX_TYPE, SIZE_VALUE>,
+        STRIDE_ONE_DIM>;
+
+    /*!
+     * @brief Performs load specified by StaticTensorRef object.
+     */
+    RAJA_HOST_DEVICE
+    RAJA_INLINE
+    static void load_ref(self_type &self, RefType const &ref)
+    {
+
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
+
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0) {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed++;
+#endif
+          self.load_packed(ptr);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_strided_n(
-              ptr+s_final_register*s_register_num_elem*stride,
-              stride,
-              N-s_final_register*s_register_num_elem);
+        // partial
+        else {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed_n++;
+#endif
+          self.load_packed_n(ptr, ref.m_tile.m_size[0]);
         }
-        return *this;
-      }
-
 
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].gather(ptr, offsets.vec(reg));
+      }
+      // strided data
+      else {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided++;
+#endif
+          self.load_strided(ptr, ref.m_stride[0]);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].gather_n(ptr, offsets.vec(s_final_register), s_num_partial_lanes);
+        // partial
+        else {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided_n++;
+#endif
+          self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
         }
-        return *this;
       }
+    }
 
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].gather(ptr, offsets.vec(reg));
-          }
-          else{
-            m_registers[reg].gather_n(ptr, offsets.vec(reg), N-reg*s_register_num_elem);
-            for(camp::idx_t r = reg+1;r < s_num_full_registers;++ r){
-              m_registers[r].broadcast(0);
-            }
-            return *this;
-          }
 
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].gather_n(
-              ptr,
-              offsets.vec(s_final_register),
-              N-s_final_register*s_register_num_elem);
-        }
-        return *this;
-      }
+    /*!
+     * @brief Performs store specified by StaticTensorRef object.
+     */
+    RAJA_HOST_DEVICE
+    RAJA_INLINE
+    static void store_ref(self_type const &self, RefType &ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
 
-      /*!
-       * Loads a dense full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].store_packed(ptr+reg*s_register_num_elem);
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0) {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed++;
+#endif
+          self.store_packed(ptr);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_packed_n(ptr+s_final_register*s_register_num_elem, s_num_partial_lanes);
+        // partial
+        else {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed_n++;
+#endif
+          self.store_packed_n(ptr, ref.m_tile.m_size[0]);
         }
-        return *this;
-      }
 
-      /*!
-       * Loads a strided full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, int stride) const
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].store_strided(ptr+reg*s_register_num_elem*stride, stride);
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_strided_n(ptr+s_final_register*s_register_num_elem*stride, stride, s_num_partial_lanes);
-        }
-        return *this;
       }
-
-      /*!
-       * Loads a dense partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, int N) const
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].store_packed(ptr+reg*s_register_num_elem);
-          }
-          else{
-            m_registers[reg].store_packed_n(ptr+reg*s_register_num_elem,
-                                           N-reg*s_register_num_elem);
-            return *this;
-          }
-
+      // strided data
+      else {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL) {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided++;
+#endif
+          self.store_strided(ptr, ref.m_stride[0]);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_packed_n(
-              ptr+s_final_register*s_register_num_elem,
-              N-s_final_register*s_register_num_elem);
+        // partial
+        else {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided_n++;
+#endif
+          self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
         }
-        return *this;
       }
+    }
+  };
 
-      /*!
-       * Loads a strided partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type  *ptr,
-          int stride, int N) const
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].store_strided(ptr+reg*s_register_num_elem*stride, stride);
-          }
-          else{
-            m_registers[reg].store_strided_n(ptr+reg*s_register_num_elem*stride,
-                                            stride,
-                                            N-reg*s_register_num_elem);
-            return *this;
-          }
-
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_strided_n(
-              ptr+s_final_register*s_register_num_elem*stride,
-              stride,
-              N-s_final_register*s_register_num_elem);
-        }
-        return *this;
-      }
 
+  /*!
+   * Loads a dense full vector from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      m_registers[reg].load_packed(ptr + reg * s_register_num_elem);
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].load_packed_n(ptr + s_final_register *
+                                                            s_register_num_elem,
+                                                  s_num_partial_lanes);
+    }
+    return *this;
+  }
 
+  /*!
+   * Loads a strided full vector from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, int stride)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      m_registers[reg].load_strided(ptr + reg * s_register_num_elem * stride,
+                                    stride);
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].load_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride,
+          stride,
+          s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, int_vector_type const &offsets) const {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].scatter(ptr, offsets.vec(reg));
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].scatter_n(ptr, offsets.vec(s_final_register), s_num_partial_lanes);
+  /*!
+   * Loads a dense partial vector from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, int N)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      if (N >= reg * s_register_num_elem + s_register_num_elem) {
+        m_registers[reg].load_packed(ptr + reg * s_register_num_elem);
+      } else {
+        m_registers[reg].load_packed_n(ptr + reg * s_register_num_elem,
+                                       N - reg * s_register_num_elem);
+
+        for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r) {
+          m_registers[r].broadcast(0);
         }
         return *this;
       }
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].load_packed_n(ptr + s_final_register *
+                                                            s_register_num_elem,
+                                                  N - s_final_register *
+                                                          s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, int_vector_type const &offsets, camp::idx_t N) const {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].scatter(ptr, offsets.vec(reg));
-          }
-          else{
-            m_registers[reg].scatter_n(ptr, offsets.vec(reg), N-reg*s_register_num_elem);
-
-            return *this;
-          }
-
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].scatter_n(
-              ptr,
-              offsets.vec(s_final_register),
-              N-s_num_full_registers*s_register_num_elem);
+  /*!
+   * Loads a strided partial vector from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr, int stride, int N)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      if (N >= reg * s_register_num_elem + s_register_num_elem) {
+        m_registers[reg].load_strided(ptr + reg * s_register_num_elem * stride,
+                                      stride);
+      } else {
+        m_registers[reg].load_strided_n(ptr +
+                                            reg * s_register_num_elem * stride,
+                                        stride,
+                                        N - reg * s_register_num_elem);
+        for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r) {
+          m_registers[r].broadcast(0);
         }
         return *this;
       }
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].load_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride,
+          stride,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &den) const {
-        self_type result;
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          result.vec(reg) = m_registers[reg].divide(den.vec(reg));
-        }
-        if(s_num_partial_lanes){
-          result.vec(s_final_register) = m_registers[s_final_register].divide_n(den.vec(s_final_register), s_num_partial_lanes);
-        }
-        return result;
-      }
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type &gather(element_type const *ptr, int_vector_type offsets)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      m_registers[reg].gather(ptr, offsets.vec(reg));
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].gather_n(ptr,
+                                             offsets.vec(s_final_register),
+                                             s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Divide n elements of this vector by another vector
-       * @param x Vector to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t n) const {
-        self_type q(*this);
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(this->get(i) / b.get(i), i);
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type &gather_n(element_type const *ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      if (N >= reg * s_register_num_elem + s_register_num_elem) {
+        m_registers[reg].gather(ptr, offsets.vec(reg));
+      } else {
+        m_registers[reg].gather_n(ptr,
+                                  offsets.vec(reg),
+                                  N - reg * s_register_num_elem);
+        for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r) {
+          m_registers[r].broadcast(0);
         }
-        return q;
+        return *this;
       }
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].gather_n(ptr,
+                                             offsets.vec(s_final_register),
+                                             N - s_final_register *
+                                                     s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Divide n elements of this vector by a scalar
-       * @param x Scalar to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(element_type const &b, camp::idx_t n) const {
-        self_type q(*this);
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(this->get(i) / b, i);
-        }
-        return q;
-      }
 
+  /*!
+   * Stores a dense full vector to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      m_registers[reg].store_packed(ptr + reg * s_register_num_elem);
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].store_packed_n(
+          ptr + s_final_register * s_register_num_elem, s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type min() const
-      {
-        // special case where there's just one parital register
-        if(s_num_full_registers == 0){
-          return m_registers[0].min_n(s_num_partial_lanes);
-        }
+  /*!
+   * Stores a strided full vector to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, int stride) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      m_registers[reg].store_strided(ptr + reg * s_register_num_elem * stride,
+                                     stride);
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].store_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride,
+          stride,
+          s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-        element_type result = m_registers[0].min();
-        for(camp::idx_t i = 1;i < s_num_full_registers;++ i){
-          result = RAJA::min<element_type>(result, m_registers[i].min());
-        }
-        if(s_num_partial_lanes){
-          result = RAJA::min<element_type>(result, m_registers[s_final_register].min_n(s_num_partial_lanes));
-        }
-        return result;
+  /*!
+   * Stores a dense partial vector to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, int N) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      if (N >= reg * s_register_num_elem + s_register_num_elem) {
+        m_registers[reg].store_packed(ptr + reg * s_register_num_elem);
+      } else {
+        m_registers[reg].store_packed_n(ptr + reg * s_register_num_elem,
+                                        N - reg * s_register_num_elem);
+        return *this;
       }
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].store_packed_n(
+          ptr + s_final_register * s_register_num_elem,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Returns the smallest element over the first N lanes
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type min_n(int N) const
-      {
-        // special case where there's just one parital register
-        if(N < s_register_num_elem){
-          return m_registers[0].min_n(N);
-        }
-
-        element_type result = m_registers[0].min();
-        for(camp::idx_t reg = 1;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            result = RAJA::min<element_type>(result, m_registers[reg].min());
-          }
-          else{
-            return RAJA::min<element_type>(result, m_registers[reg].min_n(N-reg*s_register_num_elem));
-          }
-        }
-        if(N-s_num_full_registers*s_register_num_elem > 0){
-          result = RAJA::min<element_type>(result, m_registers[s_final_register].min_n(N-s_final_register*s_register_num_elem));
-        }
-        return result;
+  /*!
+   * Stores a strided partial vector to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr, int stride, int N) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      if (N >= reg * s_register_num_elem + s_register_num_elem) {
+        m_registers[reg].store_strided(ptr + reg * s_register_num_elem * stride,
+                                       stride);
+      } else {
+        m_registers[reg].store_strided_n(ptr +
+                                             reg * s_register_num_elem * stride,
+                                         stride,
+                                         N - reg * s_register_num_elem);
+        return *this;
       }
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].store_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride,
+          stride,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type max() const
-      {
-        // special case where there's just one parital register
-        if(s_num_full_registers == 0){
-          return m_registers[0].max_n(s_num_partial_lanes);
-        }
 
-        element_type result = m_registers[0].max();
-        for(camp::idx_t i = 1;i < s_num_full_registers;++ i){
-          result = RAJA::max<element_type>(result, m_registers[i].max());
-        }
-        if(s_num_partial_lanes){
-          result = RAJA::max<element_type>(result, m_registers[s_final_register].max_n(s_num_partial_lanes));
-        }
-        return result;
-      }
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &scatter(element_type *ptr,
+                           int_vector_type const &offsets) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      m_registers[reg].scatter(ptr, offsets.vec(reg));
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].scatter_n(ptr,
+                                              offsets.vec(s_final_register),
+                                              s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Returns the largest element over the first N lanes
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type max_n(int N) const
-      {
-        // special case where there's just one parital register
-        if(N < s_register_num_elem){
-          return m_registers[0].max_n(N);
-        }
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &scatter_n(element_type *ptr,
+                             int_vector_type const &offsets,
+                             camp::idx_t N) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      if (N >= reg * s_register_num_elem + s_register_num_elem) {
+        m_registers[reg].scatter(ptr, offsets.vec(reg));
+      } else {
+        m_registers[reg].scatter_n(ptr,
+                                   offsets.vec(reg),
+                                   N - reg * s_register_num_elem);
 
-        element_type result = m_registers[0].max();
-        for(camp::idx_t reg = 1;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            result = RAJA::max<element_type>(result, m_registers[reg].max());
-          }
-          else{
-            return RAJA::max<element_type>(result, m_registers[reg].max_n(N-reg*s_register_num_elem));
-          }
-        }
-        if(N-s_num_full_registers*s_register_num_elem > 0){
-          result = RAJA::max<element_type>(result, m_registers[s_final_register].max_n(N-s_final_register*s_register_num_elem));
-        }
-        return result;
+        return *this;
       }
+    }
+    if (s_num_partial_lanes) {
+      m_registers[s_final_register].scatter_n(ptr,
+                                              offsets.vec(s_final_register),
+                                              N - s_num_full_registers *
+                                                      s_register_num_elem);
+    }
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &den) const
+  {
+    self_type result;
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg) {
+      result.vec(reg) = m_registers[reg].divide(den.vec(reg));
+    }
+    if (s_num_partial_lanes) {
+      result.vec(s_final_register) =
+          m_registers[s_final_register].divide_n(den.vec(s_final_register),
+                                                 s_num_partial_lanes);
+    }
+    return result;
+  }
 
-      /*!
-       * @brief Returns the sum of all elements
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type sum() const
-      {
-        // first do a vector sum of all registers
-        register_type s = m_registers[0];
-        for(camp::idx_t i = 1;i < s_num_registers;++ i){
-          s += m_registers[i];
-        }
-        // then a horizontal sum of result
-        return s.sum();
-      }
+  /*!
+   * @brief Divide the first n elements of this vector by another vector
+   * @param b Vector to divide by
+   * @param n Number of elements to divide
+   * @return Copy of (*this) with its first n elements divided by those of b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t n) const
+  {
+    self_type q(*this);
+    for (camp::idx_t i = 0; i < n; ++i) {
+      q.set(this->get(i) / b.get(i), i);
+    }
+    return q;
+  }
 
+  /*!
+   * @brief Divide the first n elements of this vector by a scalar
+   * @param b Scalar to divide by
+   * @param n Number of elements to divide
+   * @return Copy of (*this) with its first n elements divided by b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(element_type const &b, camp::idx_t n) const
+  {
+    self_type q(*this);
+    for (camp::idx_t i = 0; i < n; ++i) {
+      q.set(this->get(i) / b, i);
+    }
+    return q;
+  }
 
-      /*!
-       * @brief The * operator of two vectors is a element-wise multiply
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator*(self_type const &x) const {
-        return this->multiply(x);
-      }
 
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type min() const
+  {
+    // special case where there's just one partial register
+    if (s_num_full_registers == 0) {
+      return m_registers[0].min_n(s_num_partial_lanes);
+    }
+
+    element_type result = m_registers[0].min();
+    for (camp::idx_t i = 1; i < s_num_full_registers; ++i) {
+      result = RAJA::min<element_type>(result, m_registers[i].min());
+    }
+    if (s_num_partial_lanes) {
+      result = RAJA::min<element_type>(result,
+                                       m_registers[s_final_register].min_n(
+                                           s_num_partial_lanes));
+    }
+    return result;
+  }
 
-      /*!
-       * @brief The dot product of two vectors
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type dot(self_type const &x) const {
-        element_type dp(0);
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          dp += m_registers[i].dot(x.vec(i));
-        }
-        return dp;
+  /*!
+   * @brief Returns the smallest element over the first N lanes
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type min_n(int N) const
+  {
+    // special case where there's just one partial register
+    if (N < s_register_num_elem) {
+      return m_registers[0].min_n(N);
+    }
+
+    element_type result = m_registers[0].min();
+    for (camp::idx_t reg = 1; reg < s_num_full_registers; ++reg) {
+      if (N >= reg * s_register_num_elem + s_register_num_elem) {
+        result = RAJA::min<element_type>(result, m_registers[reg].min());
+      } else {
+        return RAJA::min<element_type>(result,
+                                       m_registers[reg].min_n(
+                                           N - reg * s_register_num_elem));
       }
+    }
+    if (N - s_num_full_registers * s_register_num_elem > 0) {
+      result = RAJA::min<element_type>(result,
+                                       m_registers[s_final_register].min_n(
+                                           N - s_final_register *
+                                                   s_register_num_elem));
+    }
+    return result;
+  }
 
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type max() const
+  {
+    // special case where there's just one partial register
+    if (s_num_full_registers == 0) {
+      return m_registers[0].max_n(s_num_partial_lanes);
+    }
+
+    element_type result = m_registers[0].max();
+    for (camp::idx_t i = 1; i < s_num_full_registers; ++i) {
+      result = RAJA::max<element_type>(result, m_registers[i].max());
+    }
+    if (s_num_partial_lanes) {
+      result = RAJA::max<element_type>(result,
+                                       m_registers[s_final_register].max_n(
+                                           s_num_partial_lanes));
+    }
+    return result;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &set(element_type val, int idx){
-        m_registers[to_register(idx)].set(val, to_lane(idx));
-        return *this;
+  /*!
+   * @brief Returns the largest element over the first N lanes
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type max_n(int N) const
+  {
+    // special case where there's just one partial register
+    if (N < s_register_num_elem) {
+      return m_registers[0].max_n(N);
+    }
+
+    element_type result = m_registers[0].max();
+    for (camp::idx_t reg = 1; reg < s_num_full_registers; ++reg) {
+      if (N >= reg * s_register_num_elem + s_register_num_elem) {
+        result = RAJA::max<element_type>(result, m_registers[reg].max());
+      } else {
+        return RAJA::max<element_type>(result,
+                                       m_registers[reg].max_n(
+                                           N - reg * s_register_num_elem));
       }
+    }
+    if (N - s_num_full_registers * s_register_num_elem > 0) {
+      result = RAJA::max<element_type>(result,
+                                       m_registers[s_final_register].max_n(
+                                           N - s_final_register *
+                                                   s_register_num_elem));
+    }
+    return result;
+  }
+
+  /*!
+   * @brief Returns the sum of all elements
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type sum() const
+  {
+    // first do a vector sum of all registers
+    register_type s = m_registers[0];
+    for (camp::idx_t i = 1; i < s_num_registers; ++i) {
+      s += m_registers[i];
+    }
+    // then a horizontal sum of result
+    return s.sum();
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type get(int idx) const {
-        return m_registers[to_register(idx)].get(to_lane(idx));
-      }
 
+  /*!
+   * @brief The * operator of two vectors is an element-wise multiply
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator*(self_type const &x) const { return this->multiply(x); }
 
 
-      /*!
-       * @brief Converts to vector to a string
-       *
-       *
-       */
-      RAJA_INLINE
-      std::string to_string() const {
-        std::string s = "Vector(" + std::to_string(s_num_elem) + ")[ ";
+  /*!
+   * @brief The dot product of two vectors
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type dot(self_type const &x) const
+  {
+    element_type dp(0);
+    for (camp::idx_t i = 0; i < s_num_registers; ++i) {
+      dp += m_registers[i].dot(x.vec(i));
+    }
+    return dp;
+  }
 
-        //
-        for(camp::idx_t i = 0;i < s_num_elem; ++ i){
-          s += std::to_string(this->get(i)) + " ";
-        }
 
-        camp::idx_t physical_size = s_num_registers * s_register_num_elem;
-        if(s_num_elem < physical_size){
-          s += "{";
-          for(camp::idx_t i = s_num_elem;i < physical_size; ++ i){
-            s += std::to_string(this->get(i)) + " ";
-          }
-          s += "}";
-        }
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &set(element_type val, int idx)
+  {
+    m_registers[to_register(idx)].set(val, to_lane(idx));
+    return *this;
+  }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type get(int idx) const
+  {
+    return m_registers[to_register(idx)].get(to_lane(idx));
+  }
 
-        s += " ]\n";
 
-        return s;
+  /*!
+   * @brief Converts this vector to a string
+   *
+   *
+   */
+  RAJA_INLINE
+  std::string to_string() const
+  {
+    std::string s = "Vector(" + std::to_string(s_num_elem) + ")[ ";
+
+    //
+    for (camp::idx_t i = 0; i < s_num_elem; ++i) {
+      s += std::to_string(this->get(i)) + " ";
+    }
+
+    camp::idx_t physical_size = s_num_registers * s_register_num_elem;
+    if (s_num_elem < physical_size) {
+      s += "{";
+      for (camp::idx_t i = s_num_elem; i < physical_size; ++i) {
+        s += std::to_string(this->get(i)) + " ";
       }
+      s += "}";
+    }
 
 
-  };
+    s += " ]\n";
+
+    return s;
+  }
+};
 
 
-} // namespace expt
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/stats.hpp b/include/RAJA/pattern/tensor/stats.hpp
index 77b70faf00..011d2a4f54 100644
--- a/include/RAJA/pattern/tensor/stats.hpp
+++ b/include/RAJA/pattern/tensor/stats.hpp
@@ -31,9 +31,8 @@ namespace RAJA
 {
 namespace expt
 {
-struct tensor_stats
-{
-    static int indent;
+struct tensor_stats {
+  static int indent;
 
   static camp::idx_t num_vector_copy;
   static camp::idx_t num_vector_copy_ctor;
@@ -77,10 +76,9 @@ struct tensor_stats
 
   static void resetVectorStats();
   static void printVectorStats();
-
 };
 
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/policy/MultiPolicy.hpp b/include/RAJA/policy/MultiPolicy.hpp
index defa08585a..d1064239f5 100644
--- a/include/RAJA/policy/MultiPolicy.hpp
+++ b/include/RAJA/policy/MultiPolicy.hpp
@@ -18,16 +18,13 @@
 #ifndef RAJA_MultiPolicy_HPP
 #define RAJA_MultiPolicy_HPP
 
-#include "RAJA/config.hpp"
-
 #include <tuple>
 
-#include "RAJA/policy/PolicyBase.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/internal/get_platform.hpp"
-#include "RAJA/util/plugins.hpp"
-
+#include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/util/concepts.hpp"
+#include "RAJA/util/plugins.hpp"
 #include "RAJA/util/resource.hpp"
 
 
@@ -97,10 +94,11 @@ template <typename Res,
           typename Body,
           typename Selector,
           typename... Policies>
-RAJA_INLINE resources::EventProxy<Res> forall_impl(Res r,
-                                  MultiPolicy<Selector, Policies...> p,
-                                  Iterable &&iter,
-                                  Body &&body)
+RAJA_INLINE resources::EventProxy<Res> forall_impl(
+    Res r,
+    MultiPolicy<Selector, Policies...> p,
+    Iterable &&iter,
+    Body &&body)
 {
   p.invoke(iter, body);
   return resources::EventProxy<Res>(r);
@@ -153,8 +151,9 @@ RAJA_DEPRECATE("In the next RAJA Release, MultiPolicy will be deprecated.")
 auto make_multi_policy(std::tuple<Policies...> policies, Selector s)
     -> MultiPolicy<Selector, Policies...>
 {
-  return detail::make_multi_policy(
-      camp::make_idx_seq_t<sizeof...(Policies)>{}, s, policies);
+  return detail::make_multi_policy(camp::make_idx_seq_t<sizeof...(Policies)>{},
+                                   s,
+                                   policies);
 }
 
 namespace detail
@@ -190,7 +189,9 @@ struct policy_invoker : public policy_invoker<index - 1, size, rest...> {
 
       util::callPostLaunchPlugins(context);
     } else {
-      NextInvoker::invoke(offset, std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
+      NextInvoker::invoke(offset,
+                          std::forward<Iterable>(iter),
+                          std::forward<LoopBody>(loop_body));
     }
   }
 };
@@ -214,7 +215,7 @@ struct policy_invoker<0, size, Policy, rest...> {
 
       util::callPreLaunchPlugins(context);
 
-      //std::cout <<"policy_invoker: No index\n";
+      // std::cout <<"policy_invoker: No index\n";
       using policy::multi::forall_impl;
       RAJA_FORCEINLINE_RECURSIVE
       auto r = resources::get_resource<Policy>::type::get_default();
@@ -234,7 +235,8 @@ namespace type_traits
 
 template <typename T>
 struct is_multi_policy
-    : ::RAJA::type_traits::SpecializationOf<RAJA::MultiPolicy, typename std::decay<T>::type> {
+    : ::RAJA::type_traits::SpecializationOf<RAJA::MultiPolicy,
+                                            typename std::decay<T>::type> {
 };
 }  // namespace type_traits
 
diff --git a/include/RAJA/policy/PolicyBase.hpp b/include/RAJA/policy/PolicyBase.hpp
index 898c92a621..c1097ba7e8 100644
--- a/include/RAJA/policy/PolicyBase.hpp
+++ b/include/RAJA/policy/PolicyBase.hpp
@@ -18,11 +18,11 @@
 #ifndef RAJA_POLICYBASE_HPP
 #define RAJA_POLICYBASE_HPP
 
+#include <cstddef>
+
 #include "RAJA/util/camp_aliases.hpp"
 #include "RAJA/util/concepts.hpp"
 
-#include <cstddef>
-
 namespace RAJA
 {
 
@@ -93,8 +93,9 @@ template <typename PolicyType, RAJA::Policy P_>
 struct policy_is : camp::num<policy_of<camp::decay<PolicyType>>::value == P_> {
 };
 
-template <typename PolicyType, RAJA::Policy ... Ps_>
-struct policy_any_of : camp::num<camp::concepts::any_of<policy_is<PolicyType, Ps_>...>::value> {
+template <typename PolicyType, RAJA::Policy... Ps_>
+struct policy_any_of
+    : camp::num<camp::concepts::any_of<policy_is<PolicyType, Ps_>...>::value> {
 };
 
 template <typename PolicyType, RAJA::Pattern P_>
@@ -112,17 +113,18 @@ struct platform_is
 };
 
 template <typename PolicyType, typename Trait>
-struct policy_has_trait_impl
-    : camp::num<false> {
+struct policy_has_trait_impl : camp::num<false> {
 };
 ///
-template <typename Trait, Policy Policy_,
-                          Pattern Pattern_,
-                          Launch Launch_,
-                          Platform Platform_,
-                          typename... Traits>
+template <typename Trait,
+          Policy Policy_,
+          Pattern Pattern_,
+          Launch Launch_,
+          Platform Platform_,
+          typename... Traits>
 struct policy_has_trait_impl<
-      PolicyBaseT<Policy_, Pattern_, Launch_, Platform_, Traits...>, Trait>
+    PolicyBaseT<Policy_, Pattern_, Launch_, Platform_, Traits...>,
+    Trait>
     : camp::num<camp::concepts::any_of<std::is_same<Trait, Traits>...>::value> {
 };
 ///
@@ -159,10 +161,7 @@ template <Policy Policy_,
 using make_policy_pattern_launch_platform_t =
     PolicyBaseT<Policy_, Pattern_, Launch_, Platform_, Args...>;
 
-template <Policy Policy_,
-          Pattern Pattern_,
-          Launch Launch_,
-          typename... Args>
+template <Policy Policy_, Pattern Pattern_, Launch Launch_, typename... Args>
 using make_policy_pattern_launch_t =
     PolicyBaseT<Policy_, Pattern_, Launch_, Platform::undefined, Args...>;
 
@@ -230,7 +229,8 @@ struct is_reduce_policy : RAJA::pattern_is<Pol, RAJA::Pattern::reduce> {
 };
 
 template <typename Pol>
-struct is_multi_reduce_policy : RAJA::pattern_is<Pol, RAJA::Pattern::multi_reduce> {
+struct is_multi_reduce_policy
+    : RAJA::pattern_is<Pol, RAJA::Pattern::multi_reduce> {
 };
 
 }  // end namespace type_traits
diff --git a/include/RAJA/policy/WorkGroup.hpp b/include/RAJA/policy/WorkGroup.hpp
index cae78d2493..0451642445 100644
--- a/include/RAJA/policy/WorkGroup.hpp
+++ b/include/RAJA/policy/WorkGroup.hpp
@@ -19,13 +19,10 @@
 #define RAJA_Policy_WorkGroup_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/policy/PolicyBase.hpp"
-
 #include "RAJA/internal/get_platform.hpp"
-#include "RAJA/util/plugins.hpp"
-
+#include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/util/concepts.hpp"
+#include "RAJA/util/plugins.hpp"
 
 namespace RAJA
 {
@@ -39,16 +36,14 @@ namespace workgroup
 /// Note this is intended for debugging, the WorkGroup abstraction is intended
 /// to allow running loops in an unordered fashion (loop fusion)
 struct ordered
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_order> {
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_order> {
 };
 /// execute the enqueued loops in the reverse order from the order that they
 /// were enqueued
 /// Note this is intended for debugging, the WorkGroup abstraction is intended
 /// to allow running loops in an unordered fashion (loop fusion)
 struct reverse_ordered
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_order> {
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_order> {
 };
 
 /// store an array of pointers to the enqueued objects. The enqueued objects
@@ -85,29 +80,34 @@ struct indirect_virtual_function_dispatch
 /// RangeAndCallables is a pack of types of the form camp::list<Range, Callable>
 /// where pairs of Range and Callable are the types of the range and callable
 /// objects that may be passed to WorkPool enqueue.
-template < typename ... RangeAndCallables >
+template <typename... RangeAndCallables>
 struct direct_dispatch
     : RAJA::make_policy_pattern_t<Policy::undefined,
                                   Pattern::workgroup_dispatch> {
 };
 
-template < typename EXEC_POLICY_T,
-           typename ORDER_POLICY_T,
-           typename STORAGE_POLICY_T,
-           typename DISPATCH_POLICY_T = indirect_function_call_dispatch >
-struct WorkGroupPolicy
-    : public RAJA::make_policy_pattern_platform_t<
-                       policy_of<EXEC_POLICY_T>::value,
-                       Pattern::workgroup,
-                       platform_of<EXEC_POLICY_T>::value> {
-  static_assert(RAJA::pattern_is<EXEC_POLICY_T, RAJA::Pattern::workgroup_exec>::value,
+template <typename EXEC_POLICY_T,
+          typename ORDER_POLICY_T,
+          typename STORAGE_POLICY_T,
+          typename DISPATCH_POLICY_T = indirect_function_call_dispatch>
+struct WorkGroupPolicy : public RAJA::make_policy_pattern_platform_t<
+                             policy_of<EXEC_POLICY_T>::value,
+                             Pattern::workgroup,
+                             platform_of<EXEC_POLICY_T>::value> {
+  static_assert(
+      RAJA::pattern_is<EXEC_POLICY_T, RAJA::Pattern::workgroup_exec>::value,
       "WorkGroupPolicy: EXEC_POLICY_T must be a workgroup exec policy");
-  static_assert(RAJA::pattern_is<ORDER_POLICY_T, RAJA::Pattern::workgroup_order>::value,
+  static_assert(
+      RAJA::pattern_is<ORDER_POLICY_T, RAJA::Pattern::workgroup_order>::value,
       "WorkGroupPolicy: ORDER_POLICY_T must be a workgroup order policy");
-  static_assert(RAJA::pattern_is<STORAGE_POLICY_T, RAJA::Pattern::workgroup_storage>::value,
-      "WorkGroupPolicy: STORAGE_POLICY_T must be a workgroup storage policy");
-  static_assert(RAJA::pattern_is<DISPATCH_POLICY_T, RAJA::Pattern::workgroup_dispatch>::value,
-      "WorkGroupPolicy: DISPATCH_POLICY_T must be a workgroup dispatch policy");
+  static_assert(RAJA::pattern_is<STORAGE_POLICY_T,
+                                 RAJA::Pattern::workgroup_storage>::value,
+                "WorkGroupPolicy: STORAGE_POLICY_T must be a workgroup storage "
+                "policy");
+  static_assert(RAJA::pattern_is<DISPATCH_POLICY_T,
+                                 RAJA::Pattern::workgroup_dispatch>::value,
+                "WorkGroupPolicy: DISPATCH_POLICY_T must be a workgroup "
+                "dispatch policy");
 };
 
 }  // end namespace workgroup
@@ -117,12 +117,12 @@ using policy::workgroup::ordered;
 using policy::workgroup::reverse_ordered;
 
 using policy::workgroup::array_of_pointers;
-using policy::workgroup::ragged_array_of_objects;
 using policy::workgroup::constant_stride_array_of_objects;
+using policy::workgroup::ragged_array_of_objects;
 
+using policy::workgroup::direct_dispatch;
 using policy::workgroup::indirect_function_call_dispatch;
 using policy::workgroup::indirect_virtual_function_dispatch;
-using policy::workgroup::direct_dispatch;
 
 using policy::workgroup::WorkGroupPolicy;
 
diff --git a/include/RAJA/policy/atomic_auto.hpp b/include/RAJA/policy/atomic_auto.hpp
index e0ca557b32..017b3922c9 100644
--- a/include/RAJA/policy/atomic_auto.hpp
+++ b/include/RAJA/policy/atomic_auto.hpp
@@ -19,11 +19,10 @@
 #define RAJA_policy_atomic_auto_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/util/macros.hpp"
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/sequential/atomic.hpp"
+#include "RAJA/policy/sequential/atomic.hpp"
 #endif
 
 /*!
@@ -106,9 +105,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T *acc)
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic,
-                                         T *acc,
-                                         T compare)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T *acc, T compare)
 {
   return atomicInc(RAJA_AUTO_ATOMIC, acc, compare);
 }
@@ -120,9 +117,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T *acc)
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic,
-                                         T *acc,
-                                         T compare)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T *acc, T compare)
 {
   return atomicDec(RAJA_AUTO_ATOMIC, acc, compare);
 }
@@ -146,9 +141,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(auto_atomic, T *acc, T value)
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic,
-                                              T *acc,
-                                              T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic, T *acc, T value)
 {
   return atomicExchange(RAJA_AUTO_ATOMIC, acc, value);
 }
diff --git a/include/RAJA/policy/atomic_builtin.hpp b/include/RAJA/policy/atomic_builtin.hpp
index e43bd71386..1098dd93a1 100644
--- a/include/RAJA/policy/atomic_builtin.hpp
+++ b/include/RAJA/policy/atomic_builtin.hpp
@@ -18,11 +18,12 @@
 #ifndef RAJA_policy_atomic_builtin_HPP
 #define RAJA_policy_atomic_builtin_HPP
 
-#include "RAJA/config.hpp"
-
 #include <cstdint>
 
-#if defined(RAJA_COMPILER_MSVC) || ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_COMPILER_MSVC) || \
+    ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
 #include <intrin.h>
 #endif
 
@@ -45,10 +46,12 @@ struct builtin_atomic {
 };
 
 
-namespace detail {
+namespace detail
+{
 
 
-#if defined(RAJA_COMPILER_MSVC) || ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
+#if defined(RAJA_COMPILER_MSVC) || \
+    ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
 
 
 /*!
@@ -58,10 +61,8 @@ namespace detail {
 template <typename T>
 struct builtin_useIntrinsic {
   static constexpr bool value =
-    std::is_same<T, char>::value ||
-    std::is_same<T, short>::value ||
-    std::is_same<T, long>::value ||
-    std::is_same<T, long long>::value;
+      std::is_same<T, char>::value || std::is_same<T, short>::value ||
+      std::is_same<T, long>::value || std::is_same<T, long long>::value;
 };
 
 
@@ -72,16 +73,15 @@ struct builtin_useIntrinsic {
 template <typename T>
 struct builtin_useReinterpret {
   static constexpr bool value =
-    !builtin_useIntrinsic<T>::value &&
-    (sizeof(T) == 1 ||
-     sizeof(T) == 2 ||
-     sizeof(T) == 4 ||
-     sizeof(T) == 8);
-
-  using type =
-    std::conditional_t<sizeof(T) == 1, char,
-    std::conditional_t<sizeof(T) == 2, short,
-    std::conditional_t<sizeof(T) == 4, long, long long>>>;
+      !builtin_useIntrinsic<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+
+  using type = std::conditional_t<
+      sizeof(T) == 1,
+      char,
+      std::conditional_t<sizeof(T) == 2,
+                         short,
+                         std::conditional_t<sizeof(T) == 4, long, long long>>>;
 };
 
 
@@ -92,8 +92,8 @@ struct builtin_useReinterpret {
 template <typename T>
 struct builtin_useCAS {
   static constexpr bool value =
-    !builtin_useIntrinsic<T>::value &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      !builtin_useIntrinsic<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
 
@@ -199,7 +199,9 @@ RAJA_INLINE long builtin_atomicCAS(long *acc, long compare, long value)
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicCAS(long long *acc, long long compare, long long value)
+RAJA_INLINE long long builtin_atomicCAS(long long *acc,
+                                        long long compare,
+                                        long long value)
 {
   return _InterlockedCompareExchange64(acc, value, compare);
 }
@@ -329,8 +331,8 @@ RAJA_INLINE long long builtin_atomicXor(long long *acc, long long value)
 template <typename T>
 struct builtin_useIntrinsic {
   static constexpr bool value =
-    (std::is_integral<T>::value || std::is_enum<T>::value) &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      (std::is_integral<T>::value || std::is_enum<T>::value) &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
 
@@ -340,53 +342,52 @@ struct builtin_useIntrinsic {
  */
 template <typename T>
 struct builtin_useReinterpret {
-  static constexpr bool value =
-    !std::is_integral<T>::value &&
-    !std::is_enum<T>::value &&
-    ((sizeof(T) == 1
+  static constexpr bool value = !std::is_integral<T>::value &&
+                                !std::is_enum<T>::value &&
+                                ((sizeof(T) == 1
 #if !defined(UINT8_MAX)
-      && sizeof(unsigned char) == 1
+                                  && sizeof(unsigned char) == 1
 #endif
-     ) ||
-     (sizeof(T) == 2
+                                  ) ||
+                                 (sizeof(T) == 2
 #if !defined(UINT16_MAX)
-      && sizeof(unsigned short) == 2
+                                  && sizeof(unsigned short) == 2
 #endif
-     ) ||
-     (sizeof(T) == 4
+                                  ) ||
+                                 (sizeof(T) == 4
 #if !defined(UINT32_MAX)
-      && sizeof(unsigned int) == 4
+                                  && sizeof(unsigned int) == 4
 #endif
-     ) ||
-     (sizeof(T) == 8
+                                  ) ||
+                                 (sizeof(T) == 8
 #if !defined(UINT64_MAX)
-      && sizeof(unsigned long long) == 8
+                                  && sizeof(unsigned long long) == 8
 #endif
-     ));
+                                  ));
 
   using type =
-    std::conditional_t<sizeof(T) == 1,
+      std::conditional_t<sizeof(T) == 1,
 #if defined(UINT8_MAX)
-                       uint8_t,
+                         uint8_t,
 #else
-                       unsigned char,
+                         unsigned char,
 #endif
-    std::conditional_t<sizeof(T) == 2,
+                         std::conditional_t<sizeof(T) == 2,
 #if defined(UINT16_MAX)
-                       uint16_t,
+                                            uint16_t,
 #else
-                       unsigned short,
+                                            unsigned short,
 #endif
-    std::conditional_t<sizeof(T) == 4,
+                                            std::conditional_t<sizeof(T) == 4,
 #if defined(UINT32_MAX)
-                       uint32_t,
+                                                               uint32_t,
 #else
-                       unsigned int,
+                                                               unsigned int,
 #endif
 #if defined(UINT64_MAX)
-                       uint64_t>>>;
+                                                               uint64_t>>>;
 #else
-                       unsigned long long>>>;
+                                                               unsigned long long>>>;
 #endif
 };
 
@@ -398,8 +399,8 @@ struct builtin_useReinterpret {
 template <typename T>
 struct builtin_useCAS {
   static constexpr bool value =
-    !std::is_integral<T>::value && !std::is_enum<T>::value &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      !std::is_integral<T>::value && !std::is_enum<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
 
@@ -534,7 +535,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc)
   using R = builtin_useReinterpret_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    builtin_atomicLoad(reinterpret_cast<R*>(acc)));
+      builtin_atomicLoad(reinterpret_cast<R *>(acc)));
 }
 
 
@@ -547,7 +548,7 @@ RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
 {
   using R = builtin_useReinterpret_t<T>;
 
-  builtin_atomicStore(reinterpret_cast<R*>(acc),
+  builtin_atomicStore(reinterpret_cast<R *>(acc),
                       RAJA::util::reinterp_A_as_B<T, R>(value));
 }
 
@@ -562,8 +563,8 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value)
   using R = builtin_useReinterpret_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    builtin_atomicExchange(reinterpret_cast<R*>(acc),
-                           RAJA::util::reinterp_A_as_B<T, R>(value)));
+      builtin_atomicExchange(reinterpret_cast<R *>(acc),
+                             RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
@@ -577,9 +578,9 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
   using R = builtin_useReinterpret_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    builtin_atomicCAS(reinterpret_cast<R*>(acc),
-                      RAJA::util::reinterp_A_as_B<T, R>(compare),
-                      RAJA::util::reinterp_A_as_B<T, R>(value)));
+      builtin_atomicCAS(reinterpret_cast<R *>(acc),
+                        RAJA::util::reinterp_A_as_B<T, R>(compare),
+                        RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
@@ -622,8 +623,7 @@ RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
  * Returns the OLD value that was replaced by the result of this operation.
  */
 template <typename T, typename Oper>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
-                                                     Oper &&oper)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc, Oper &&oper)
 {
   T old = builtin_atomicLoad(acc);
   T expected;
@@ -673,65 +673,50 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
 /*!
  * Atomic addition using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
 RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old + value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old + value; });
 }
 
 
 /*!
  * Atomic subtraction using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
 RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old - value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old - value; });
 }
 
 
 /*!
  * Atomic and using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
 RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old & value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old & value; });
 }
 
 
 /*!
  * Atomic or using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
 RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old | value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old | value; });
 }
 
 
 /*!
  * Atomic xor using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
 RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old ^ value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old ^ value; });
 }
 
 
@@ -766,26 +751,18 @@ template <typename T>
 RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, T *acc, T value)
 {
   return detail::builtin_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return value < old ? value : old;
-    },
-    [value] (T current) {
-      return current <= value;
-    });
+      acc,
+      [value](T old) { return value < old ? value : old; },
+      [value](T current) { return current <= value; });
 }
 
 template <typename T>
 RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, T *acc, T value)
 {
   return detail::builtin_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return old < value ? value : old;
-    },
-    [value] (T current) {
-      return value <= current;
-    });
+      acc,
+      [value](T old) { return old < value ? value : old; },
+      [value](T current) { return value <= current; });
 }
 
 template <typename T>
@@ -797,7 +774,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc)
 template <typename T>
 RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc, T value)
 {
-  return detail::builtin_atomicCAS_loop(acc, [value] (T old) {
+  return detail::builtin_atomicCAS_loop(acc, [value](T old) {
     return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
   });
 }
@@ -811,8 +788,9 @@ RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc)
 template <typename T>
 RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc, T value)
 {
-  return detail::builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old == static_cast<T>(0) || value < old ? value : old - static_cast<T>(1);
+  return detail::builtin_atomicCAS_loop(acc, [value](T old) {
+    return old == static_cast<T>(0) || value < old ? value
+                                                   : old - static_cast<T>(1);
   });
 }
 
@@ -841,7 +819,8 @@ RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, T *acc, T value)
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicCAS(builtin_atomic, T *acc, T compare, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T
+atomicCAS(builtin_atomic, T *acc, T compare, T value)
 {
   return detail::builtin_atomicCAS(acc, compare, value);
 }
diff --git a/include/RAJA/policy/cuda.hpp b/include/RAJA/policy/cuda.hpp
index e9d5bc454f..b372471ebd 100644
--- a/include/RAJA/policy/cuda.hpp
+++ b/include/RAJA/policy/cuda.hpp
@@ -28,19 +28,19 @@
 #include <cuda_runtime.h>
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/cuda/atomic.hpp"
+#include "RAJA/policy/cuda/atomic.hpp"
 #endif
 
+#include "RAJA/policy/cuda/WorkGroup.hpp"
 #include "RAJA/policy/cuda/forall.hpp"
+#include "RAJA/policy/cuda/kernel.hpp"
+#include "RAJA/policy/cuda/launch.hpp"
+#include "RAJA/policy/cuda/multi_reduce.hpp"
 #include "RAJA/policy/cuda/policy.hpp"
 #include "RAJA/policy/cuda/reduce.hpp"
-#include "RAJA/policy/cuda/multi_reduce.hpp"
 #include "RAJA/policy/cuda/scan.hpp"
 #include "RAJA/policy/cuda/sort.hpp"
-#include "RAJA/policy/cuda/kernel.hpp"
 #include "RAJA/policy/cuda/synchronize.hpp"
-#include "RAJA/policy/cuda/launch.hpp"
-#include "RAJA/policy/cuda/WorkGroup.hpp"
 
 #endif  // closing endif for if defined(RAJA_ENABLE_CUDA)
 
diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
index 88a89d5362..6102a923c7 100644
--- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
+++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -30,14 +30,13 @@
 #include <type_traits>
 #include <unordered_map>
 
+#include "RAJA/policy/cuda/policy.hpp"
+#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
 #include "RAJA/util/basic_mempool.hpp"
-#include "RAJA/util/mutex.hpp"
-#include "RAJA/util/types.hpp"
 #include "RAJA/util/macros.hpp"
+#include "RAJA/util/mutex.hpp"
 #include "RAJA/util/resource.hpp"
-
-#include "RAJA/policy/cuda/policy.hpp"
-#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
+#include "RAJA/util/types.hpp"
 
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
 #include "nvToolsExt.h"
@@ -141,8 +140,10 @@ struct DevicePinnedAllocator {
     cudaErrchk(cudaGetDevice(&device));
     void* ptr;
     cudaErrchk(cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal));
-    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device));
-    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
+    cudaErrchk(
+        cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device));
+    cudaErrchk(cudaMemAdvise(
+        ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
 
     return ptr;
   }
@@ -158,7 +159,8 @@ struct DevicePinnedAllocator {
 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
 using device_zeroed_mempool_type =
     basic_mempool::MemPool<DeviceZeroedAllocator>;
-using device_pinned_mempool_type = basic_mempool::MemPool<DevicePinnedAllocator>;
+using device_pinned_mempool_type =
+    basic_mempool::MemPool<DevicePinnedAllocator>;
 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 
 namespace detail
@@ -170,7 +172,7 @@ struct cudaInfo {
   cuda_dim_t gridDim{0, 0, 0};
   cuda_dim_t blockDim{0, 0, 0};
   size_t* dynamic_smem = nullptr;
-  ::RAJA::resources::Cuda res{::RAJA::resources::Cuda::CudaFromStream(0,0)};
+  ::RAJA::resources::Cuda res{::RAJA::resources::Cuda::CudaFromStream(0, 0)};
   bool setup_reducers = false;
 };
 struct cudaStatusInfo : cudaInfo {
@@ -190,10 +192,7 @@ extern cudaStatusInfo tl_status;
 extern std::unordered_map<cudaStream_t, bool> g_stream_info_map;
 
 RAJA_INLINE
-void synchronize_impl(::RAJA::resources::Cuda res)
-{
-  res.wait();
-}
+void synchronize_impl(::RAJA::resources::Cuda res) { res.wait(); }
 
 }  // namespace detail
 
@@ -254,17 +253,24 @@ void launch(::RAJA::resources::Cuda res, bool async = true)
 
 //! Launch kernel and indicate resource synchronization status
 RAJA_INLINE
-void launch(const void* func, cuda_dim_t gridDim, cuda_dim_t blockDim, void** args, size_t shmem,
-            ::RAJA::resources::Cuda res, bool async = true, const char *name = nullptr)
+void launch(const void* func,
+            cuda_dim_t gridDim,
+            cuda_dim_t blockDim,
+            void** args,
+            size_t shmem,
+            ::RAJA::resources::Cuda res,
+            bool async = true,
+            const char* name = nullptr)
 {
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-  if(name) nvtxRangePushA(name);
+  if (name) nvtxRangePushA(name);
 #else
   RAJA_UNUSED_VAR(name);
 #endif
-  cudaErrchk(cudaLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
+  cudaErrchk(
+      cudaLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-  if(name) nvtxRangePop();
+  if (name) nvtxRangePop();
 #endif
   launch(res, async);
 }
@@ -283,9 +289,11 @@ cuda_dim_t currentGridDim() { return detail::tl_status.gridDim; }
 
 //! get grid size of current launch
 RAJA_INLINE
-cuda_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x *
-                                             detail::tl_status.gridDim.y *
-                                             detail::tl_status.gridDim.z; }
+cuda_dim_member_t currentGridSize()
+{
+  return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
+         detail::tl_status.gridDim.z;
+}
 
 //! get blockDim of current launch
 RAJA_INLINE
@@ -293,9 +301,11 @@ cuda_dim_t currentBlockDim() { return detail::tl_status.blockDim; }
 
 //! get block size of current launch
 RAJA_INLINE
-cuda_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x *
-                                              detail::tl_status.blockDim.y *
-                                              detail::tl_status.blockDim.z; }
+cuda_dim_member_t currentBlockSize()
+{
+  return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
+         detail::tl_status.blockDim.z;
+}
 
 //! get dynamic shared memory usage for current launch
 RAJA_INLINE
@@ -310,7 +320,8 @@ size_t maxDynamicShmem()
   return func_attr.maxDynamicSharedSizeBytes;
 }
 
-constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::max();
+constexpr size_t dynamic_smem_allocation_failure =
+    std::numeric_limits<size_t>::max();
 
 //! Allocate dynamic shared memory for current launch
 //
@@ -322,19 +333,19 @@ constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::
 //  Returns an offset into dynamic shared memory aligned to align on success,
 //  or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory
 //  takes the failure return path.
-template < typename T, typename GetNFromMax >
-RAJA_INLINE
-size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T))
+template <typename T, typename GetNFromMax>
+RAJA_INLINE size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
+                                        size_t align = alignof(T))
 {
   const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
   const size_t align_offset = ((unaligned_shmem % align) != size_t(0))
-      ? align - (unaligned_shmem % align)
-      : size_t(0);
+                                  ? align - (unaligned_shmem % align)
+                                  : size_t(0);
   const size_t aligned_shmem = unaligned_shmem + align_offset;
 
   const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
-  const size_t n_bytes = sizeof(T) *
-      std::forward<GetNFromMax>(get_n_from_max)(max_shmem_bytes / sizeof(T));
+  const size_t n_bytes = sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
+                                         max_shmem_bytes / sizeof(T));
 
   if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) {
     *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
@@ -362,7 +373,8 @@ RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
     ::RAJA::resources::Cuda res,
     LOOP_BODY&& loop_body)
 {
-  ::RAJA::detail::ScopedAssignment<detail::cudaInfo> info_sa(detail::tl_status,
+  ::RAJA::detail::ScopedAssignment<detail::cudaInfo> info_sa(
+      detail::tl_status,
       detail::cudaInfo{func, gridDim, blockDim, &dynamic_smem, res, true});
 
   using return_type = typename std::remove_reference<LOOP_BODY>::type;
@@ -375,10 +387,10 @@ static constexpr size_t cuda_occupancy_uninitialized_size_t =
     std::numeric_limits<size_t>::max();
 
 //! Struct with the maximum theoretical occupancy of the device
-struct CudaFixedMaxBlocksData
-{
+struct CudaFixedMaxBlocksData {
   int device_sm_per_device = cuda::device_prop().multiProcessorCount;
-  int device_max_threads_per_sm = cuda::device_prop().maxThreadsPerMultiProcessor;
+  int device_max_threads_per_sm =
+      cuda::device_prop().maxThreadsPerMultiProcessor;
 };
 
 //! Get the maximum theoretical occupancy of the device
@@ -391,18 +403,17 @@ CudaFixedMaxBlocksData cuda_max_blocks()
 }
 
 //! Struct with the maximum occupancy of a kernel in simple terms
-struct CudaOccMaxBlocksThreadsData
-{
+struct CudaOccMaxBlocksThreadsData {
   size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
   int func_max_blocks_per_device = cuda_occupancy_uninitialized_int;
   int func_max_threads_per_block = cuda_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with unknown threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE CudaOccMaxBlocksThreadsData
+cuda_occupancy_max_blocks_threads(const void* func,
+                                  size_t func_dynamic_shmem_per_block)
 {
   static thread_local CudaOccMaxBlocksThreadsData data;
 
@@ -410,27 +421,27 @@ CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func,
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
 
-    cudaErrchk(cudaOccupancyMaxPotentialBlockSize(
-        &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block));
-
+    cudaErrchk(
+        cudaOccupancyMaxPotentialBlockSize(&data.func_max_blocks_per_device,
+                                           &data.func_max_threads_per_block,
+                                           func,
+                                           func_dynamic_shmem_per_block));
   }
 
   return data;
 }
 
 //! Struct with the maximum occupancy of a kernel in specific terms
-struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData
-{
+struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData {
   size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
   int func_threads_per_block = cuda_occupancy_uninitialized_int;
   int func_max_blocks_per_sm = cuda_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with compile time threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block >
-RAJA_INLINE
-CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block>
+RAJA_INLINE CudaOccMaxBlocksData
+cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block)
 {
   static thread_local CudaOccMaxBlocksData data;
 
@@ -440,30 +451,35 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
     data.func_threads_per_block = func_threads_per_block;
 
     cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
-
+        &data.func_max_blocks_per_sm,
+        func,
+        func_threads_per_block,
+        func_dynamic_shmem_per_block));
   }
 
   return data;
 }
 
 //! Get the maximum occupancy of a kernel with runtime threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block, int func_threads_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE CudaOccMaxBlocksData
+cuda_occupancy_max_blocks(const void* func,
+                          size_t func_dynamic_shmem_per_block,
+                          int func_threads_per_block)
 {
   static thread_local CudaOccMaxBlocksData data;
 
-  if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
-       data.func_threads_per_block != func_threads_per_block ) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
+      data.func_threads_per_block != func_threads_per_block) {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
     data.func_threads_per_block = func_threads_per_block;
 
     cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
-
+        &data.func_max_blocks_per_sm,
+        func,
+        func_threads_per_block,
+        func_dynamic_shmem_per_block));
   }
 
   return data;
@@ -496,14 +512,16 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
  *
  ******************************************************************************
  */
-template < typename IdxT, typename Concretizer, typename UniqueMarker>
-struct ConcretizerImpl
-{
-  ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len)
-    : m_func(func)
-    , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block)
-    , m_len(len)
-  { }
+template <typename IdxT, typename Concretizer, typename UniqueMarker>
+struct ConcretizerImpl {
+  ConcretizerImpl(const void* func,
+                  size_t func_dynamic_shmem_per_block,
+                  IdxT len)
+      : m_func(func),
+        m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
+        m_len(len)
+  {
+  }
 
   IdxT get_max_block_size() const
   {
@@ -517,7 +535,8 @@ struct ConcretizerImpl
   IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
     if (func_threads_per_block <= func_max_threads_per_block) {
       return func_threads_per_block;
     } else {
@@ -528,7 +547,8 @@ struct ConcretizerImpl
   //! Get a grid size when block size is specified
   IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const
   {
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return func_blocks_per_device;
   }
 
@@ -536,26 +556,31 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_len() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
   //! Get a block size when grid size is specified
   IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
     return std::min(func_threads_per_block, func_max_threads_per_block);
   }
 
   //! Get a grid size when block size is specified
   IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const
   {
-    auto data = cuda_occupancy_max_blocks<UniqueMarker>(
-        m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
-    IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size<IdxT>(data);
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    auto data =
+        cuda_occupancy_max_blocks<UniqueMarker>(m_func,
+                                                m_func_dynamic_shmem_per_block,
+                                                func_threads_per_block);
+    IdxT func_max_blocks_per_device =
+        Concretizer::template get_max_grid_size<IdxT>(data);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return std::min(func_blocks_per_device, func_max_blocks_per_device);
   }
 
@@ -563,9 +588,9 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_device() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        this->get_grid_size_to_fit_device(func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
 private:
diff --git a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
index f6269b36e4..236a435d03 100644
--- a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
@@ -18,16 +18,13 @@
 #ifndef RAJA_cuda_WorkGroup_Dispatcher_HPP
 #define RAJA_cuda_WorkGroup_Dispatcher_HPP
 
-#include "RAJA/config.hpp"
-
-#include "camp/resource.hpp"
-
-#include "RAJA/policy/cuda/policy.hpp"
+#include <mutex>
+#include <thread>
 
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
-
-#include <thread>
-#include <mutex>
+#include "RAJA/policy/cuda/policy.hpp"
+#include "camp/resource.hpp"
 
 
 namespace RAJA
@@ -41,9 +38,9 @@ namespace cuda
 
 // global function that creates the value on the device using the
 // factory and writes it into a pinned ptr
-template < typename Factory >
-__global__ void get_value_global(
-    typename Factory::value_type* ptr, Factory factory)
+template <typename Factory>
+__global__ void get_value_global(typename Factory::value_type* ptr,
+                                 Factory factory)
 {
   *ptr = factory();
 }
@@ -73,7 +70,7 @@ inline std::mutex& get_value_mutex()
 // get the device function pointer by calling a global function to
 // write it into a pinned ptr, beware different instantiates of this
 // function may run concurrently
-template < typename Factory >
+template <typename Factory>
 inline auto get_value(Factory&& factory)
 {
   using value_type = typename std::decay_t<Factory>::value_type;
@@ -81,8 +78,9 @@ inline auto get_value(Factory&& factory)
 
   auto res = ::camp::resources::Cuda::get_default();
   auto ptr = static_cast<value_type*>(get_cached_value_ptr(sizeof(value_type)));
-  auto func = reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
-  void *args[] = {(void*)&ptr, (void*)&factory};
+  auto func =
+      reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
+  void* args[] = {(void*)&ptr, (void*)&factory};
   cudaErrchk(cudaLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
   cudaErrchk(cudaStreamSynchronize(res.get_stream()));
 
@@ -91,7 +89,7 @@ inline auto get_value(Factory&& factory)
 
 // get the device function pointer and store it so it can be used
 // multiple times
-template < typename Factory >
+template <typename Factory>
 inline auto get_cached_value(Factory&& factory)
 {
   static auto value = get_value(std::forward<Factory>(factory));
@@ -101,17 +99,20 @@ inline auto get_cached_value(Factory&& factory)
 }  // namespace cuda
 
 /*!
-* Populate and return a Dispatcher object that can be used in device code
-*/
-template < typename T, typename Dispatcher_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async >
-inline const Dispatcher_T* get_Dispatcher(cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> const&)
+ * Populate and return a Dispatcher object that can be used in device code
+ */
+template <typename T,
+          typename Dispatcher_T,
+          size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async>
+inline const Dispatcher_T* get_Dispatcher(
+    cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> const&)
 {
   static Dispatcher_T dispatcher{
-        Dispatcher_T::template makeDispatcher<T>(
-          [](auto&& factory) {
-            return cuda::get_cached_value(
-                std::forward<decltype(factory)>(factory));
-          }) };
+      Dispatcher_T::template makeDispatcher<T>([](auto&& factory) {
+        return cuda::get_cached_value(std::forward<decltype(factory)>(factory));
+      })};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
index 41fe17c84a..00bce070bf 100644
--- a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
@@ -19,11 +19,9 @@
 #define RAJA_cuda_WorkGroup_WorkRunner_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/policy/cuda/policy.hpp"
-#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
-
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
+#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
+#include "RAJA/policy/cuda/policy.hpp"
 
 
 namespace RAJA
@@ -36,35 +34,35 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          typename... Args>
+struct WorkRunner<RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
     : WorkRunnerForallOrdered<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{
+          RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+          RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+          RAJA::ordered,
+          DISPATCH_POLICY_T,
+          ALLOCATOR_T,
+          INDEX_T,
+          Args...> {
   using base = WorkRunnerForallOrdered<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+      RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+      RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+      RAJA::ordered,
+      DISPATCH_POLICY_T,
+      ALLOCATOR_T,
+      INDEX_T,
+      Args...>;
   using base::base;
   using IndexType = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
@@ -73,9 +71,10 @@ struct WorkRunner<
   /// run the loops in the given work container in order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -84,7 +83,9 @@ struct WorkRunner<
 
     // Only synchronize if we had something to iterate over
     if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::cuda::synchronize(r); }
+      if (!Async) {
+        RAJA::cuda::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -95,35 +96,35 @@ struct WorkRunner<
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          typename... Args>
+struct WorkRunner<RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
     : WorkRunnerForallReverse<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{
+          RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+          RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+          RAJA::reverse_ordered,
+          DISPATCH_POLICY_T,
+          ALLOCATOR_T,
+          INDEX_T,
+          Args...> {
   using base = WorkRunnerForallReverse<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+      RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+      RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+      RAJA::reverse_ordered,
+      DISPATCH_POLICY_T,
+      ALLOCATOR_T,
+      INDEX_T,
+      Args...>;
   using base::base;
   using IndexType = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
@@ -132,9 +133,10 @@ struct WorkRunner<
   /// run the loops in the given work container in reverse order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -143,7 +145,9 @@ struct WorkRunner<
 
     // Only synchronize if we had something to iterate over
     if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::cuda::synchronize(r); }
+      if (!Async) {
+        RAJA::cuda::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -155,26 +159,28 @@ struct WorkRunner<
  * A body and segment holder for storing loops that will be executed
  * on the device
  */
-template <typename Segment_type, typename LoopBody,
-          typename index_type, typename ... Args>
-struct HoldCudaDeviceXThreadblockLoop
-{
-  template < typename segment_in, typename body_in >
+template <typename Segment_type,
+          typename LoopBody,
+          typename index_type,
+          typename... Args>
+struct HoldCudaDeviceXThreadblockLoop {
+  template <typename segment_in, typename body_in>
   HoldCudaDeviceXThreadblockLoop(segment_in&& segment, body_in&& body)
-    : m_segment(std::forward<segment_in>(segment))
-    , m_body(std::forward<body_in>(body))
-  { }
+      : m_segment(std::forward<segment_in>(segment)),
+        m_body(std::forward<body_in>(body))
+  {
+  }
 
   RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
   {
     // TODO:: decide when to run hooks, may bypass this and use impl directly
     // TODO:: decide whether or not to privatize the loop body
     const index_type i_begin = threadIdx.x + blockIdx.x * blockDim.x;
-    const index_type stride  = blockDim.x * gridDim.x;
+    const index_type stride = blockDim.x * gridDim.x;
     const auto begin = m_segment.begin();
-    const auto end   = m_segment.end();
+    const auto end = m_segment.end();
     const index_type len(end - begin);
-    for ( index_type i = i_begin; i < len; i += stride ) {
+    for (index_type i = i_begin; i < len; i += stride) {
       m_body(begin[i], std::forward<Args>(args)...);
     }
   }
@@ -184,12 +190,12 @@ struct HoldCudaDeviceXThreadblockLoop
   LoopBody m_body;
 };
 
-template < size_t BLOCK_SIZE,
-           size_t BLOCKS_PER_SM,
-           typename StorageIter,
-           typename value_type,
-           typename index_type,
-           typename ... Args >
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          typename StorageIter,
+          typename value_type,
+          typename index_type,
+          typename... Args>
 __launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__
     void cuda_unordered_y_block_global(StorageIter iter, Args... args)
 {
@@ -206,21 +212,24 @@ __launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__
  * the x direction, with the number of threads in the x dimension determined
  * by the average number of iterates per loop
  */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner<
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{
-  using exec_policy = RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
-  using order_policy = RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average;
+    RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+    RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average,
+    DISPATCH_POLICY_T,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...> {
+  using exec_policy =
+      RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
+  using order_policy = RAJA::policy::cuda::
+      unordered_cuda_loop_y_block_iter_x_threadblock_average;
   using dispatch_policy = DISPATCH_POLICY_T;
   using Allocator = ALLOCATOR_T;
   using index_type = INDEX_T;
@@ -228,14 +237,15 @@ struct WorkRunner<
 
   // The type that will hold the segment and loop body in work storage
   struct holder_type {
-    template < typename T >
+    template <typename T>
     using type = HoldCudaDeviceXThreadblockLoop<
-        typename camp::at<T, camp::num<0>>::type, // ITERABLE
-        typename camp::at<T, camp::num<1>>::type, // LOOP_BODY
-        index_type, Args...>;
+        typename camp::at<T, camp::num<0>>::type,  // ITERABLE
+        typename camp::at<T, camp::num<1>>::type,  // LOOP_BODY
+        index_type,
+        Args...>;
   };
   ///
-  template < typename T >
+  template <typename T>
   using holder_type_t = typename holder_type::template type<T>;
 
   // The policy indicating where the call function is invoked
@@ -244,21 +254,25 @@ struct WorkRunner<
 
   // The Dispatcher policy with holder_types used internally to handle the
   // ranges and callables passed in by the user.
-  using dispatcher_holder_policy = dispatcher_transform_types_t<dispatch_policy, holder_type>;
+  using dispatcher_holder_policy =
+      dispatcher_transform_types_t<dispatch_policy, holder_type>;
 
-  using dispatcher_type = Dispatcher<Platform::cuda, dispatcher_holder_policy, RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>, Args...>;
+  using dispatcher_type =
+      Dispatcher<Platform::cuda,
+                 dispatcher_holder_policy,
+                 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>,
+                 Args...>;
 
   WorkRunner() = default;
 
   WorkRunner(WorkRunner const&) = delete;
   WorkRunner& operator=(WorkRunner const&) = delete;
 
-  WorkRunner(WorkRunner && o)
-    : m_total_iterations(o.m_total_iterations)
+  WorkRunner(WorkRunner&& o) : m_total_iterations(o.m_total_iterations)
   {
     o.m_total_iterations = 0;
   }
-  WorkRunner& operator=(WorkRunner && o)
+  WorkRunner& operator=(WorkRunner&& o)
   {
     m_total_iterations = o.m_total_iterations;
 
@@ -268,17 +282,21 @@ struct WorkRunner<
 
   // runner interfaces with storage to enqueue so the runner can get
   // information from the segment and loop at enqueue time
-  template < typename WorkContainer, typename Iterable, typename LoopBody >
-  inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body)
+  template <typename WorkContainer, typename Iterable, typename LoopBody>
+  inline void enqueue(WorkContainer& storage,
+                      Iterable&& iter,
+                      LoopBody&& loop_body)
   {
-    using Iterator  = camp::decay<decltype(std::begin(iter))>;
+    using Iterator = camp::decay<decltype(std::begin(iter))>;
     using LOOP_BODY = camp::decay<LoopBody>;
-    using ITERABLE  = camp::decay<Iterable>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+    using ITERABLE = camp::decay<Iterable>;
+    using IndexType =
+        camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
     using holder = holder_type_t<camp::list<ITERABLE, LOOP_BODY>>;
 
-    // using true_value_type = typename WorkContainer::template true_value_type<holder>;
+    // using true_value_type = typename WorkContainer::template
+    // true_value_type<holder>;
 
     Iterator begin = std::begin(iter);
     Iterator end = std::end(iter);
@@ -290,30 +308,41 @@ struct WorkRunner<
       m_total_iterations += len;
 
       //
-      // TODO: Privatize the loop_body, using make_launch_body to setup reductions
+      // TODO: Privatize the loop_body, using make_launch_body to setup
+      // reductions
       //
       // LOOP_BODY body = RAJA::cuda::make_launch_body(func,
-      //     gridSize, blockSize, shmem, stream, std::forward<LoopBody>(loop_body));
+      //     gridSize, blockSize, shmem, stream,
+      //     std::forward<LoopBody>(loop_body));
 
-      storage.template emplace<holder>(
-          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy{}),
-          std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
+      storage.template emplace<holder>(get_Dispatcher<holder, dispatcher_type>(
+                                           dispatcher_exec_policy{}),
+                                       std::forward<Iterable>(iter),
+                                       std::forward<LoopBody>(loop_body));
     }
   }
 
   // no extra storage required here
   using per_run_storage = int;
 
-  template < typename WorkContainer >
-  per_run_storage run(WorkContainer const& storage, resource_type r, Args... args) const
+  template <typename WorkContainer>
+  per_run_storage run(WorkContainer const& storage,
+                      resource_type r,
+                      Args... args) const
   {
-    using Iterator  = camp::decay<decltype(std::begin(storage))>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(storage), std::end(storage)))>;
+    using Iterator = camp::decay<decltype(std::begin(storage))>;
+    using IndexType = camp::decay<decltype(std::distance(std::begin(storage),
+                                                         std::end(storage)))>;
     using value_type = typename WorkContainer::value_type;
 
     per_run_storage run_storage{};
 
-    auto func = cuda_unordered_y_block_global<BLOCK_SIZE, BLOCKS_PER_SM, Iterator, value_type, index_type, Args...>;
+    auto func = cuda_unordered_y_block_global<BLOCK_SIZE,
+                                              BLOCKS_PER_SM,
+                                              Iterator,
+                                              value_type,
+                                              index_type,
+                                              Args...>;
 
     //
     // Compute the requested iteration space size
@@ -325,14 +354,17 @@ struct WorkRunner<
     // Only launch kernel if we have something to iterate over
     if (num_loops > 0 && BLOCK_SIZE > 0) {
 
-      index_type average_iterations = m_total_iterations / static_cast<index_type>(num_loops);
+      index_type average_iterations =
+          m_total_iterations / static_cast<index_type>(num_loops);
 
       //
       // Compute the number of blocks
       //
       constexpr index_type block_size = static_cast<index_type>(BLOCK_SIZE);
       cuda_dim_t blockSize{static_cast<cuda_dim_member_t>(block_size), 1, 1};
-      cuda_dim_t gridSize{static_cast<cuda_dim_member_t>((average_iterations + block_size - 1) / block_size),
+      cuda_dim_t gridSize{static_cast<cuda_dim_member_t>(
+                              (average_iterations + block_size - 1) /
+                              block_size),
                           static_cast<cuda_dim_member_t>(num_loops),
                           1};
 
@@ -347,8 +379,9 @@ struct WorkRunner<
         //
         // Launch the kernel
         //
-        void* func_args[] = { (void*)&begin, (void*)&args... };
-        RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async);
+        void* func_args[] = {(void*)&begin, (void*)&args...};
+        RAJA::cuda::launch(
+            (const void*)func, gridSize, blockSize, func_args, shmem, r, Async);
       }
 
       RAJA_FT_END;
@@ -358,10 +391,7 @@ struct WorkRunner<
   }
 
   // clear any state so ready to be destroyed or reused
-  void clear()
-  {
-    m_total_iterations = 0;
-  }
+  void clear() { m_total_iterations = 0; }
 
 private:
   index_type m_total_iterations = 0;
diff --git a/include/RAJA/policy/cuda/atomic.hpp b/include/RAJA/policy/cuda/atomic.hpp
index aedfe91a03..a814a9297d 100644
--- a/include/RAJA/policy/cuda/atomic.hpp
+++ b/include/RAJA/policy/cuda/atomic.hpp
@@ -25,7 +25,8 @@
 #include <stdexcept>
 #include <type_traits>
 
-#if __CUDA__ARCH__ >= 600 && __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6
+#if __CUDA__ARCH__ >= 600 && __CUDACC_VER_MAJOR__ >= 11 && \
+    __CUDACC_VER_MINOR__ >= 6
 #define RAJA_ENABLE_CUDA_ATOMIC_REF
 #endif
 
@@ -33,10 +34,9 @@
 #include <cuda/atomic>
 #endif
 
-#include "camp/list.hpp"
-
-#include "RAJA/policy/sequential/atomic.hpp"
 #include "RAJA/policy/atomic_builtin.hpp"
+#include "RAJA/policy/sequential/atomic.hpp"
+#include "camp/list.hpp"
 #if defined(RAJA_ENABLE_OPENMP)
 #include "RAJA/policy/openmp/atomic.hpp"
 #endif
@@ -66,10 +66,9 @@ namespace detail
  */
 template <typename T>
 struct cuda_useBuiltinCommon {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value;
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value;
 };
 
 
@@ -82,14 +81,13 @@ struct cuda_useBuiltinCommon {
  */
 template <typename T>
 struct cuda_useReinterpretCommon {
-  static constexpr bool value =
-    !cuda_useBuiltinCommon<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+  static constexpr bool value = !cuda_useBuiltinCommon<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 
@@ -125,11 +123,10 @@ RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
  */
 template <typename T>
 struct cuda_useBuiltinExchange {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value ||
-    std::is_same<T, float>::value;
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value ||
+                                std::is_same<T, float>::value;
 };
 
 /*!
@@ -138,21 +135,21 @@ struct cuda_useBuiltinExchange {
  */
 template <typename T>
 struct cuda_useReinterpretExchange {
-  static constexpr bool value =
-    !cuda_useBuiltinExchange<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+  static constexpr bool value = !cuda_useBuiltinExchange<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
 template <typename T>
-using cuda_useReinterpretExchange_t = typename cuda_useReinterpretExchange<T>::type;
+using cuda_useReinterpretExchange_t =
+    typename cuda_useReinterpretExchange<T>::type;
 
 /*!
  * Performs an atomic exchange using a builtin function. Stores the new value
@@ -176,8 +173,8 @@ RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value)
   using R = cuda_useReinterpretExchange_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    cuda_atomicExchange(reinterpret_cast<R*>(acc),
-                        RAJA::util::reinterp_A_as_B<T, R>(value)));
+      cuda_atomicExchange(reinterpret_cast<R *>(acc),
+                          RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
@@ -190,7 +187,7 @@ template <typename T>
 RAJA_INLINE __device__ T cuda_atomicLoad(T *acc)
 {
   return cuda::atomic_ref<T, cuda::thread_scope_device>(*acc).load(
-    cuda::memory_order_relaxed{});
+      cuda::memory_order_relaxed{});
 }
 
 
@@ -198,7 +195,7 @@ template <typename T>
 RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value)
 {
   cuda::atomic_ref<T, cuda::thread_scope_device>(*acc).store(
-    value, cuda::memory_order_relaxed{});
+      value, cuda::memory_order_relaxed{});
 }
 
 #else
@@ -217,7 +214,7 @@ RAJA_INLINE __device__ T cuda_atomicLoad(T *acc)
   using R = cuda_useReinterpretCommon_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    cuda_atomicLoad(reinterpret_cast<R*>(acc)));
+      cuda_atomicLoad(reinterpret_cast<R *>(acc)));
 }
 
 template <typename T>
@@ -241,11 +238,10 @@ template <typename T>
 struct cuda_useBuiltinCAS {
   static constexpr bool value =
 #if __CUDA_ARCH__ >= 700
-    std::is_same<T, unsigned short int>::value ||
+      std::is_same<T, unsigned short int>::value ||
 #endif
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value;
+      std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
+      std::is_same<T, unsigned long long>::value;
 };
 
 /*!
@@ -255,28 +251,26 @@ struct cuda_useBuiltinCAS {
  */
 template <typename T>
 struct cuda_useReinterpretCAS {
-  static constexpr bool value =
-    !cuda_useBuiltinCAS<T>::value &&
-    (
+  static constexpr bool value = !cuda_useBuiltinCAS<T>::value &&
+                                (
 #if __CUDA_ARCH__ >= 700
-     sizeof(T) == sizeof(unsigned short) ||
+                                    sizeof(T) == sizeof(unsigned short) ||
 #endif
-     sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long)
-    );
+                                    sizeof(T) == sizeof(unsigned int) ||
+                                    sizeof(T) == sizeof(unsigned long long));
 
   using type =
 #if __CUDA_ARCH__ >= 700
-    std::conditional_t<sizeof(T) == sizeof(unsigned short),
-                       unsigned short,
+      std::conditional_t<sizeof(T) == sizeof(unsigned short),
+                         unsigned short,
 #endif
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int,
-                       unsigned long long>
+                         std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                            unsigned int,
+                                            unsigned long long>
 #if __CUDA_ARCH__ >= 700
-                      >
+                         >
 #endif
-    ;
+      ;
 };
 
 /*!
@@ -299,9 +293,9 @@ RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value)
   using R = cuda_useReinterpretCAS_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    cuda_atomicCAS(reinterpret_cast<R*>(acc),
-                   RAJA::util::reinterp_A_as_B<T, R>(compare),
-                   RAJA::util::reinterp_A_as_B<T, R>(value)));
+      cuda_atomicCAS(reinterpret_cast<R *>(acc),
+                     RAJA::util::reinterp_A_as_B<T, R>(compare),
+                     RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 /*!
@@ -311,14 +305,14 @@ RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value)
  */
 template <typename T,
           std::enable_if_t<cuda_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b)
+RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T &a, const T &b)
 {
   return a == b;
 }
 
 template <typename T,
           std::enable_if_t<cuda_useReinterpretCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b)
+RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T &a, const T &b)
 {
   using R = cuda_useReinterpretCommon_t<T>;
 
@@ -334,8 +328,7 @@ RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b)
  * operation.
  */
 template <typename T, typename Oper>
-RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc,
-                                             Oper&& oper)
+RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc, Oper &&oper)
 {
   T old = cuda_atomicLoad(acc);
   T expected;
@@ -349,15 +342,15 @@ RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc,
 }
 
 /*!
- * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting.
- * Implementation uses the existing CUDA supplied unsigned 32-bit or 64-bit CAS
- * operator. Returns the OLD value that was replaced by the result of this
- * operation.
+ * Generic implementation of any atomic 32-bit or 64-bit operator with
+ * short-circuiting. Implementation uses the existing CUDA supplied unsigned
+ * 32-bit or 64-bit CAS operator. Returns the OLD value that was replaced by the
+ * result of this operation.
  */
 template <typename T, typename Oper, typename ShortCircuit>
 RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc,
-                                             Oper&& oper,
-                                             ShortCircuit&& sc)
+                                             Oper &&oper,
+                                             ShortCircuit &&sc)
 {
   T old = cuda_atomicLoad(acc);
 
@@ -379,28 +372,27 @@ RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc,
 /*!
  * Atomic addition
  */
-using cuda_atomicAdd_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long int,
-  float
+using cuda_atomicAdd_builtin_types = ::camp::list<int,
+                                                  unsigned int,
+                                                  unsigned long long int,
+                                                  float
 #if __CUDA_ARCH__ >= 600
-  ,
-  double
+                                                  ,
+                                                  double
 #endif
->;
+                                                  >;
 
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicAdd_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicAdd_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old + value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old + value; });
 }
 
 template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicAdd_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_any_of<T, cuda_atomicAdd_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value)
 {
   return ::atomicAdd(acc, value);
@@ -412,38 +404,38 @@ RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value)
  */
 using cuda_atomicSub_builtin_types = cuda_atomicAdd_builtin_types;
 
-using cuda_atomicSub_via_Sub_builtin_types = ::camp::list<
-  int,
-  unsigned int
->;
+using cuda_atomicSub_via_Sub_builtin_types = ::camp::list<int, unsigned int>;
 
-using cuda_atomicSub_via_Add_builtin_types = ::camp::list<
-  unsigned long long int,
-  float
+using cuda_atomicSub_via_Add_builtin_types =
+    ::camp::list<unsigned long long int,
+                 float
 #if __CUDA_ARCH__ >= 600
-  ,
-  double
+                 ,
+                 double
 #endif
->;
+                 >;
 
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicSub_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicSub_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old - value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old - value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Sub_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Sub_builtin_types> * =
+        nullptr>
 RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
 {
   return ::atomicSub(acc, value);
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Add_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Add_builtin_types> * =
+        nullptr>
 RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
 {
   return ::atomicAdd(acc, -value);
@@ -453,36 +445,33 @@ RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
 /*!
  * Atomic min/max
  */
-using cuda_atomicMinMax_builtin_types = ::camp::list<
-  int,
-  unsigned int
+using cuda_atomicMinMax_builtin_types = ::camp::list<int,
+                                                     unsigned int
 #if __CUDA_ARCH__ >= 500
-  ,
-  long long int,
-  unsigned long long int
+                                                     ,
+                                                     long long int,
+                                                     unsigned long long int
 #endif
->;
+                                                     >;
 
 
 /*!
  * Atomic min
  */
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>
+              * = nullptr>
 RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value)
 {
   return cuda_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return value < old ? value : old;
-    },
-    [value] (T current) {
-      return current <= value;
-    });
+      acc,
+      [value](T old) { return value < old ? value : old; },
+      [value](T current) { return current <= value; });
 }
 
 template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>
+              * = nullptr>
 RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value)
 {
   return ::atomicMin(acc, value);
@@ -493,21 +482,19 @@ RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value)
  * Atomic max
  */
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>
+              * = nullptr>
 RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value)
 {
   return cuda_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return old < value ? value : old;
-    },
-    [value] (T current) {
-      return value <= current;
-    });
+      acc,
+      [value](T old) { return old < value ? value : old; },
+      [value](T current) { return value <= current; });
 }
 
 template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>
+              * = nullptr>
 RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value)
 {
   return ::atomicMax(acc, value);
@@ -517,27 +504,29 @@ RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value)
 /*!
  * Atomic increment/decrement with reset
  */
-using cuda_atomicIncDecReset_builtin_types = ::camp::list<
-  unsigned int
->;
+using cuda_atomicIncDecReset_builtin_types = ::camp::list<unsigned int>;
 
 
 /*!
  * Atomic increment with reset
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>
+        * = nullptr>
 RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
+  return cuda_atomicCAS_loop(acc, [value](T old) {
     return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types> * =
+        nullptr>
 RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value)
 {
   return ::atomicInc(acc, value);
@@ -557,19 +546,24 @@ RAJA_INLINE __device__ T cuda_atomicInc(T *acc)
 /*!
  * Atomic decrement with reset
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>
+        * = nullptr>
 RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old == static_cast<T>(0) || value < old ? value : old - static_cast<T>(1);
+  return cuda_atomicCAS_loop(acc, [value](T old) {
+    return old == static_cast<T>(0) || value < old ? value
+                                                   : old - static_cast<T>(1);
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types> * =
+        nullptr>
 RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value)
 {
   return ::atomicDec(acc, value);
@@ -589,27 +583,24 @@ RAJA_INLINE __device__ T cuda_atomicDec(T *acc)
 /*!
  * Atomic bitwise functions (and, or, xor)
  */
-using cuda_atomicBit_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long int
->;
+using cuda_atomicBit_builtin_types =
+    ::camp::list<int, unsigned int, unsigned long long int>;
 
 
 /*!
  * Atomic and
  */
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old & value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old & value; });
 }
 
 template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value)
 {
   return ::atomicAnd(acc, value);
@@ -620,12 +611,11 @@ RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value)
  * Atomic or
  */
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old | value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old | value; });
 }
 
 /*!
@@ -638,16 +628,16 @@ RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
  * Atomic xor
  */
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old ^ value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old ^ value; });
 }
 
 template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
 {
   return ::atomicXor(acc, value);
@@ -667,8 +657,8 @@ RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicLoad(cuda_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(cuda_atomic_explicit<host_policy>,
+                                          T *acc)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicLoad(acc);
@@ -679,8 +669,9 @@ atomicLoad(cuda_atomic_explicit<host_policy>, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE void
-atomicStore(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(cuda_atomic_explicit<host_policy>,
+                                              T *acc,
+                                              T value)
 {
 #ifdef __CUDA_ARCH__
   detail::cuda_atomicStore(acc, value);
@@ -691,8 +682,9 @@ atomicStore(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAdd(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(cuda_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicAdd(acc, value);
@@ -703,8 +695,9 @@ atomicAdd(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicSub(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(cuda_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicSub(acc, value);
@@ -715,8 +708,9 @@ atomicSub(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMin(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(cuda_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicMin(acc, value);
@@ -727,8 +721,9 @@ atomicMin(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMax(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(cuda_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicMax(acc, value);
@@ -739,8 +734,9 @@ atomicMax(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(cuda_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   // See:
@@ -753,8 +749,8 @@ atomicInc(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(cuda_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(cuda_atomic_explicit<host_policy>,
+                                         T *acc)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicInc(acc);
@@ -765,8 +761,9 @@ atomicInc(cuda_atomic_explicit<host_policy>, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(cuda_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   // See:
@@ -779,8 +776,8 @@ atomicDec(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(cuda_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(cuda_atomic_explicit<host_policy>,
+                                         T *acc)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicDec(acc);
@@ -791,8 +788,9 @@ atomicDec(cuda_atomic_explicit<host_policy>, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAnd(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(cuda_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicAnd(acc, value);
@@ -803,8 +801,9 @@ atomicAnd(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicOr(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(cuda_atomic_explicit<host_policy>,
+                                        T *acc,
+                                        T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicOr(acc, value);
@@ -815,8 +814,9 @@ atomicOr(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicXor(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(cuda_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicXor(acc, value);
@@ -827,8 +827,9 @@ atomicXor(cuda_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicExchange(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(cuda_atomic_explicit<host_policy>,
+                                              T *acc,
+                                              T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicExchange(acc, value);
diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp
index 493136400c..513951d7d9 100644
--- a/include/RAJA/policy/cuda/forall.hpp
+++ b/include/RAJA/policy/cuda/forall.hpp
@@ -28,22 +28,16 @@
 
 #include <algorithm>
 
+#include "RAJA/index/IndexSet.hpp"
+#include "RAJA/internal/fault_tolerance.hpp"
 #include "RAJA/pattern/forall.hpp"
-
 #include "RAJA/pattern/params/forall.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/internal/fault_tolerance.hpp"
-
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/policy/cuda/policy.hpp"
 #include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
-
-#include "RAJA/index/IndexSet.hpp"
-
+#include "RAJA/util/macros.hpp"
 #include "RAJA/util/resource.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -70,61 +64,87 @@ namespace impl
  *
  ******************************************************************************
  */
-template<typename IterationMapping, typename IterationGetter, typename Concretizer, typename UniqueMarker>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          typename UniqueMarker>
 struct ForallDimensionCalculator;
 
 // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0
 // there are specializations for named_usage::unspecified
 // but named_usage::ignored is not supported so no specializations are provided
 // and static_asserts in the general case catch unsupported values
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0 or named_usage::unspecified with "
+                "forall");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0 or named_usage::unspecified with "
+                "forall");
 
   using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
     const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
 
-    if ( len > (block_size * grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (block_size * grid_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
-    internal::set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexGetter::block_size));
-    internal::set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexGetter::grid_size));
+    internal::set_cuda_dim<dim>(dims.threads,
+                                static_cast<IdxT>(IndexGetter::block_size));
+    internal::set_cuda_dim<dim>(dims.blocks,
+                                static_cast<IdxT>(IndexGetter::grid_size));
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0 or named_usage::unspecified with "
+                "forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size);
 
-    if ( block_size == IdxT(0) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (block_size == IdxT(0)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
@@ -132,21 +152,30 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim,
+          int BLOCK_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0 or named_usage::unspecified with "
+                "forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
     const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size);
@@ -156,19 +185,24 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim, typename Concretizer, typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker> {
+  using IndexGetter = ::RAJA::cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_len();
 
@@ -177,20 +211,30 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0 or named_usage::unspecified with "
+                "forall");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0 or named_usage::unspecified with "
+                "forall");
 
   using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT RAJA_UNUSED_ARG(len),
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT RAJA_UNUSED_ARG(len),
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
     const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
@@ -200,21 +244,30 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0 or named_usage::unspecified with "
+                "forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size);
@@ -224,21 +277,30 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim,
+          int BLOCK_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0 or named_usage::unspecified with "
+                "forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
     const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size);
@@ -248,19 +310,24 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim, typename Concretizer, typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker> {
+  using IndexGetter = ::RAJA::cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_device();
 
@@ -291,14 +358,14 @@ template <typename EXEC_POL,
           typename IndexType,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
           typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size > 0),
+                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_cuda_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -316,14 +383,13 @@ template <typename EXEC_POL,
           typename IndexType,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
           typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size <= 0),
+                           size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forall_cuda_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -342,22 +408,22 @@ template <typename EXEC_POL,
           typename ForallParam,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
           typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size > 0),
+                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_cuda_kernel(LOOP_BODY loop_body,
+                             const Iterator idx,
+                             IndexType length,
+                             ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
   auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  if (ii < length) {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -370,137 +436,138 @@ template <typename EXEC_POL,
           typename ForallParam,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
           typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size <= 0),
+                           size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_cuda_kernel(LOOP_BODY loop_body,
+                                    const Iterator idx,
+                                    IndexType length,
+                                    ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
   auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  if (ii < length) {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template <
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_cuda_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
        ii += IterationGetter::template size<IndexType>()) {
     body(idx[ii]);
   }
 }
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+template <
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forall_cuda_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
        ii += IterationGetter::template size<IndexType>()) {
     body(idx[ii]);
   }
 }
 
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template <
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_cuda_kernel(LOOP_BODY loop_body,
+                             const Iterator idx,
+                             IndexType length,
+                             ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
        ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+template <
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_cuda_kernel(LOOP_BODY loop_body,
+                                    const Iterator idx,
+                                    IndexType length,
+                                    ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
        ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -515,27 +582,43 @@ void forallp_cuda_kernel(LOOP_BODY loop_body,
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BlocksPerSM, bool Async,
+template <typename Iterable,
+          typename LoopBody,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BlocksPerSM,
+          bool Async,
           typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Cuda>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Cuda cuda_res,
-            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>const&,
+            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                     IterationGetter,
+                                                     Concretizer,
+                                                     BlocksPerSM,
+                                                     Async> const&,
             Iterable&& iter,
             LoopBody&& loop_body,
             ForallParam)
 {
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
+  using Iterator = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                            IterationGetter,
+                                                            Concretizer,
+                                                            BlocksPerSM,
+                                                            Async>;
+  using UniqueMarker = ::camp::
+      list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping,
+                                                              IterationGetter,
+                                                              Concretizer,
+                                                              UniqueMarker>;
 
   //
   // Compute the requested iteration space size
@@ -547,9 +630,12 @@ forall_impl(resources::Cuda cuda_res,
   // Only launch kernel if we have something to iterate over
   if (len > 0) {
 
-    auto func = reinterpret_cast<const void*>(
-        &impl::forall_cuda_kernel<EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
-                                  IndexType>);
+    auto func =
+        reinterpret_cast<const void*>(&impl::forall_cuda_kernel<EXEC_POL,
+                                                                BlocksPerSM,
+                                                                Iterator,
+                                                                LOOP_BODY,
+                                                                IndexType>);
 
     //
     // Setup shared memory buffers
@@ -568,14 +654,20 @@ forall_impl(resources::Cuda cuda_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::cuda::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, cuda_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body =
+          RAJA::cuda::make_launch_body(func,
+                                       dims.blocks,
+                                       dims.threads,
+                                       shmem,
+                                       cuda_res,
+                                       std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len};
-      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len};
+      RAJA::cuda::launch(
+          func, dims.blocks, dims.threads, args, shmem, cuda_res, Async);
     }
 
     RAJA_FT_END;
@@ -585,27 +677,48 @@ forall_impl(resources::Cuda cuda_res,
 }
 
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BlocksPerSM, bool Async,
+template <typename Iterable,
+          typename LoopBody,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BlocksPerSM,
+          bool Async,
           typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Cuda>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Cuda cuda_res,
-            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async> const&,
+            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                     IterationGetter,
+                                                     Concretizer,
+                                                     BlocksPerSM,
+                                                     Async> const&,
             Iterable&& iter,
             LoopBody&& loop_body,
             ForallParam f_params)
 {
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
+  using Iterator = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, camp::num<BlocksPerSM>, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                            IterationGetter,
+                                                            Concretizer,
+                                                            BlocksPerSM,
+                                                            Async>;
+  using UniqueMarker = ::camp::list<IterationMapping,
+                                    IterationGetter,
+                                    camp::num<BlocksPerSM>,
+                                    LOOP_BODY,
+                                    Iterator,
+                                    ForallParam>;
+  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping,
+                                                              IterationGetter,
+                                                              Concretizer,
+                                                              UniqueMarker>;
 
   //
   // Compute the requested iteration space size
@@ -618,8 +731,12 @@ forall_impl(resources::Cuda cuda_res,
   if (len > 0) {
 
     auto func = reinterpret_cast<const void*>(
-        &impl::forallp_cuda_kernel< EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
-                                   IndexType, camp::decay<ForallParam> >);
+        &impl::forallp_cuda_kernel<EXEC_POL,
+                                   BlocksPerSM,
+                                   Iterator,
+                                   LOOP_BODY,
+                                   IndexType,
+                                   camp::decay<ForallParam>>);
 
     //
     // Setup shared memory buffers
@@ -645,14 +762,23 @@ forall_impl(resources::Cuda cuda_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::cuda::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, cuda_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body =
+          RAJA::cuda::make_launch_body(func,
+                                       dims.blocks,
+                                       dims.threads,
+                                       shmem,
+                                       cuda_res,
+                                       std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params};
-      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res, Async);
+      void* args[] = {(void*)&body,
+                      (void*)&begin,
+                      (void*)&len,
+                      (void*)&f_params};
+      RAJA::cuda::launch(
+          func, dims.blocks, dims.threads, args, shmem, cuda_res, Async);
 
       RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params, launch_info);
     }
@@ -683,21 +809,33 @@ forall_impl(resources::Cuda cuda_res,
  ******************************************************************************
  */
 template <typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BlocksPerSM, bool Async,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BlocksPerSM,
+          bool Async,
           typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Cuda>
-forall_impl(resources::Cuda r,
-            ExecPolicy<seq_segit, ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>>,
-            const TypedIndexSet<SegmentTypes...>& iset,
-            LoopBody&& loop_body)
+RAJA_INLINE resources::EventProxy<resources::Cuda> forall_impl(
+    resources::Cuda r,
+    ExecPolicy<seq_segit,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BlocksPerSM,
+                                                        Async>>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
   for (int isi = 0; isi < num_seg; ++isi) {
     iset.segmentCall(r,
                      isi,
                      detail::CallForall(),
-                     ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, true>(),
+                     ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                              IterationGetter,
+                                                              Concretizer,
+                                                              BlocksPerSM,
+                                                              true>(),
                      loop_body);
   }  // iterate over segments of index set
 
diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp
index b2daa3a23e..fe66f7fd63 100644
--- a/include/RAJA/policy/cuda/intrinsics.hpp
+++ b/include/RAJA/policy/cuda/intrinsics.hpp
@@ -25,15 +25,14 @@
 
 #if defined(RAJA_ENABLE_CUDA)
 
-#include <type_traits>
-
 #include <cuda.h>
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/SoAArray.hpp"
-#include "RAJA/util/types.hpp"
+#include <type_traits>
 
 #include "RAJA/policy/cuda/policy.hpp"
+#include "RAJA/util/SoAArray.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 
 
 namespace RAJA
@@ -57,17 +56,10 @@ namespace impl
  *       so device scope fences are required to make memory accesses visible
  *       to the whole device.
  */
-struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
-{
-  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
-  {
-    __threadfence();
-  }
+struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor {
+  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_release()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
 };
 
 /*!
@@ -90,20 +82,21 @@ struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
  *
  ******************************************************************************
  */
-struct AccessorDeviceScopeUseBlockFence
-{
+struct AccessorDeviceScopeUseBlockFence {
   // cuda has 32 and 64 bit atomics
   static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int);
   static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long);
 
-  template < typename T >
+  template <typename T>
   static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::
+        AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
-    auto ptr = const_cast<integer_type*>(reinterpret_cast<const integer_type*>(in_ptr + idx));
+    auto ptr = const_cast<integer_type*>(
+        reinterpret_cast<const integer_type*>(in_ptr + idx));
 
     for (size_t i = 0; i < u.array_size(); ++i) {
       u.array[i] = atomicAdd(&ptr[i], integer_type(0));
@@ -112,10 +105,11 @@ struct AccessorDeviceScopeUseBlockFence
     return u.get_value();
   }
 
-  template < typename T >
+  template <typename T>
   static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::
+        AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
@@ -127,15 +121,9 @@ struct AccessorDeviceScopeUseBlockFence
     }
   }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_release()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
 };
 
 
@@ -160,7 +148,9 @@ constexpr size_t max_shfl_int_type_size = sizeof(unsigned int);
 template <typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::
+      AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size>
+          u;
   u.set_value(var);
 
   for (size_t i = 0; i < u.array_size(); ++i) {
@@ -176,7 +166,9 @@ RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 template <typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::
+      AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size>
+          u;
   u.set_value(var);
 
   for (size_t i = 0; i < u.array_size(); ++i) {
@@ -198,7 +190,9 @@ RAJA_DEVICE RAJA_INLINE int shfl_xor_sync<int>(int var, int laneMask)
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync<unsigned int>(unsigned int var, int laneMask)
+RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync<unsigned int>(
+    unsigned int var,
+    int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
@@ -210,19 +204,24 @@ RAJA_DEVICE RAJA_INLINE long shfl_xor_sync<long>(long var, int laneMask)
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync<unsigned long>(unsigned long var, int laneMask)
+RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync<unsigned long>(
+    unsigned long var,
+    int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var, int laneMask)
+RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var,
+                                                           int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync<unsigned long long>(unsigned long long var, int laneMask)
+RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync<unsigned long long>(
+    unsigned long long var,
+    int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
@@ -265,7 +264,8 @@ RAJA_DEVICE RAJA_INLINE int shfl_sync<int>(int var, int srcLane)
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync<unsigned int>(unsigned int var, int srcLane)
+RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync<unsigned int>(unsigned int var,
+                                                             int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
@@ -277,19 +277,24 @@ RAJA_DEVICE RAJA_INLINE long shfl_sync<long>(long var, int srcLane)
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync<unsigned long>(unsigned long var, int srcLane)
+RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync<unsigned long>(
+    unsigned long var,
+    int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var, int srcLane)
+RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var,
+                                                       int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync<unsigned long long>(unsigned long long var, int srcLane)
+RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync<unsigned long long>(
+    unsigned long long var,
+    int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
@@ -417,16 +422,22 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   // reduce per warp values
   if (numThreads > policy::cuda::device_constants.WARP_SIZE) {
 
-    static_assert(policy::cuda::device_constants.MAX_WARPS <= policy::cuda::device_constants.WARP_SIZE,
-        "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values");
+    static_assert(policy::cuda::device_constants.MAX_WARPS <=
+                      policy::cuda::device_constants.WARP_SIZE,
+                  "This algorithms assumes a warp of WARP_SIZE threads can "
+                  "reduce MAX_WARPS values");
 
     // Need to separate declaration and initialization for clang-cuda
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>)];
 
     // Partial placement new: Should call new(tmpsd) here but recasting memory
     // to avoid calling constructor/destructor in shared memory.
     RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>* sd =
-      reinterpret_cast<RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS> *>(tmpsd);
+        reinterpret_cast<
+            RAJA::detail::SoAArray<T,
+                                   policy::cuda::device_constants.MAX_WARPS>*>(
+            tmpsd);
 
     // write per warp values to shared memory
     if (warpId == 0) {
diff --git a/include/RAJA/policy/cuda/kernel/Conditional.hpp b/include/RAJA/policy/cuda/kernel/Conditional.hpp
index ff15848bcb..e315480d32 100644
--- a/include/RAJA/policy/cuda/kernel/Conditional.hpp
+++ b/include/RAJA/policy/cuda/kernel/Conditional.hpp
@@ -18,17 +18,14 @@
 #ifndef RAJA_policy_cuda_kernel_Conditional_HPP
 #define RAJA_policy_cuda_kernel_Conditional_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/Conditional.hpp"
-
 #include "RAJA/policy/cuda/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -48,10 +45,7 @@ struct CudaStatementExecutor<Data,
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     if (Conditional::eval(data)) {
 
@@ -61,10 +55,7 @@ struct CudaStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
index 7465f515b0..616abad871 100644
--- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
+++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
@@ -26,19 +26,15 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/For.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
-
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
-#include "RAJA/policy/cuda/policy.hpp"
-
 #include "RAJA/policy/cuda/kernel/internal.hpp"
+#include "RAJA/policy/cuda/policy.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -51,7 +47,8 @@ namespace RAJA
  * Blocks per SM must be chosen by the user.
  */
 template <bool async0, int num_blocks, int num_threads, int blocks_per_sm>
-struct cuda_explicit_launch {};
+struct cuda_explicit_launch {
+};
 
 /*!
  * CUDA kernel launch policy where the user specifies the number of physical
@@ -67,7 +64,10 @@ struct cuda_explicit_launch {};
  * Blocks per SM defaults to 1.
  */
 template <bool async0, int num_blocks, int num_threads>
-using cuda_launch = cuda_explicit_launch<async0, num_blocks, num_threads, policy::cuda::MIN_BLOCKS_PER_SM>;
+using cuda_launch = cuda_explicit_launch<async0,
+                                         num_blocks,
+                                         num_threads,
+                                         policy::cuda::MIN_BLOCKS_PER_SM>;
 
 /*!
  * CUDA kernel launch policy where the number of physical blocks and threads
@@ -75,7 +75,11 @@ using cuda_launch = cuda_explicit_launch<async0, num_blocks, num_threads, policy
  * If num_threads is 0 then num_threads is chosen at runtime.
  */
 template <int num_threads0, bool async0>
-using cuda_occ_calc_launch = cuda_explicit_launch<async0, 0, num_threads0, policy::cuda::MIN_BLOCKS_PER_SM>;
+using cuda_occ_calc_launch =
+    cuda_explicit_launch<async0,
+                         0,
+                         num_threads0,
+                         policy::cuda::MIN_BLOCKS_PER_SM>;
 
 namespace statement
 {
@@ -87,7 +91,10 @@ namespace statement
  */
 template <typename LaunchConfig, typename... EnclosedStmts>
 struct CudaKernelExt
-    : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit<LaunchConfig, void, void, 0, true>, EnclosedStmts...> {
+    : public internal::Statement<
+          ::RAJA::policy::cuda::
+              cuda_exec_explicit<LaunchConfig, void, void, 0, true>,
+          EnclosedStmts...> {
 };
 
 
@@ -98,8 +105,8 @@ struct CudaKernelExt
  * The kernel launch is synchronous.
  */
 template <int num_blocks, int num_threads, typename... EnclosedStmts>
-using CudaKernelExp =
-    CudaKernelExt<cuda_launch<false, num_blocks, num_threads>, EnclosedStmts...>;
+using CudaKernelExp = CudaKernelExt<cuda_launch<false, num_blocks, num_threads>,
+                                    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with the flexibility
@@ -135,9 +142,9 @@ using CudaKernelOccAsync =
  * The kernel launch is synchronous.
  */
 template <int num_threads, typename... EnclosedStmts>
-using CudaKernelFixed =
-    CudaKernelExt<cuda_launch<false, operators::limits<int>::max(), num_threads>,
-                  EnclosedStmts...>;
+using CudaKernelFixed = CudaKernelExt<
+    cuda_launch<false, operators::limits<int>::max(), num_threads>,
+    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with a fixed
@@ -156,7 +163,10 @@ using CudaKernelFixedAsync =
  */
 template <int num_threads, int blocks_per_sm, typename... EnclosedStmts>
 using CudaKernelFixedSM =
-    CudaKernelExt<cuda_explicit_launch<false, operators::limits<int>::max(), num_threads, blocks_per_sm>,
+    CudaKernelExt<cuda_explicit_launch<false,
+                                       operators::limits<int>::max(),
+                                       num_threads,
+                                       blocks_per_sm>,
                   EnclosedStmts...>;
 
 /*!
@@ -166,7 +176,10 @@ using CudaKernelFixedSM =
  */
 template <int num_threads, int blocks_per_sm, typename... EnclosedStmts>
 using CudaKernelFixedSMAsync =
-    CudaKernelExt<cuda_explicit_launch<true, operators::limits<int>::max(), num_threads, blocks_per_sm>,
+    CudaKernelExt<cuda_explicit_launch<true,
+                                       operators::limits<int>::max(),
+                                       num_threads,
+                                       blocks_per_sm>,
                   EnclosedStmts...>;
 
 /*!
@@ -231,13 +244,17 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__
  * The default case handles BlockSize != 0 and gets the fixed max block size
  * version of the kernel.
  */
-template<int BlockSize, int BlocksPerSM, typename Data, typename executor_t>
-struct CudaKernelLauncherGetter
-{
-  using type = camp::decay<decltype(&internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data, executor_t>)>;
+template <int BlockSize, int BlocksPerSM, typename Data, typename executor_t>
+struct CudaKernelLauncherGetter {
+  using type =
+      camp::decay<decltype(&internal::CudaKernelLauncherFixed<BlockSize,
+                                                              BlocksPerSM,
+                                                              Data,
+                                                              executor_t>)>;
   static constexpr type get() noexcept
   {
-    return &internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data, executor_t>;
+    return &internal::
+        CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data, executor_t>;
   }
 };
 
@@ -245,10 +262,10 @@ struct CudaKernelLauncherGetter
  * Helper class specialization for BlockSize == 0 and gets the unfixed max
  * block size version of the kernel.
  */
-template<typename Data, typename executor_t>
-struct CudaKernelLauncherGetter<0, 0, Data, executor_t>
-{
-  using type = camp::decay<decltype(&internal::CudaKernelLauncher<Data, executor_t>)>;
+template <typename Data, typename executor_t>
+struct CudaKernelLauncherGetter<0, 0, Data, executor_t> {
+  using type =
+      camp::decay<decltype(&internal::CudaKernelLauncher<Data, executor_t>)>;
   static constexpr type get() noexcept
   {
     return &internal::CudaKernelLauncher<Data, executor_t>;
@@ -256,12 +273,14 @@ struct CudaKernelLauncherGetter<0, 0, Data, executor_t>
 };
 
 
-
 /*!
  * Helper class that handles CUDA kernel launching, and computing
  * maximum number of threads/blocks
  */
-template<typename LaunchPolicy, typename StmtList, typename Data, typename Types>
+template <typename LaunchPolicy,
+          typename StmtList,
+          typename Data,
+          typename Types>
 struct CudaLaunchHelper;
 
 
@@ -270,24 +289,39 @@ struct CudaLaunchHelper;
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the CUDA occupancy calculator.
  */
-template<bool async0, int num_blocks, int num_threads, int blocks_per_sm, typename StmtList, typename Data, typename Types>
-struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,StmtList,Data,Types>
-{
+template <bool async0,
+          int num_blocks,
+          int num_threads,
+          int blocks_per_sm,
+          typename StmtList,
+          typename Data,
+          typename Types>
+struct CudaLaunchHelper<
+    cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,
+    StmtList,
+    Data,
+    Types> {
   using Self = CudaLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::cuda_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::cuda_statement_list_executor_t<StmtList, Data, Types>;
 
-  using kernelGetter_t = CudaKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, (blocks_per_sm <= 0) ? 0 : blocks_per_sm, Data, executor_t>;
+  using kernelGetter_t =
+      CudaKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
+                               (blocks_per_sm <= 0) ? 0 : blocks_per_sm,
+                               Data,
+                               executor_t>;
 
-  inline static const void* get_func()
+  inline static const void *get_func()
   {
-    return reinterpret_cast<const void*>(kernelGetter_t::get());
+    return reinterpret_cast<const void *>(kernelGetter_t::get());
   }
 
   inline static void recommended_blocks_threads(size_t shmem_size,
-      int &recommended_blocks, int &recommended_threads)
+                                                int &recommended_blocks,
+                                                int &recommended_threads)
   {
     auto func = Self::get_func();
 
@@ -299,8 +333,9 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
         // determine blocks at runtime
         // determine threads at runtime
         //
-        auto data = ::RAJA::cuda::cuda_occupancy_max_blocks_threads<Self>(
-            func, shmem_size);
+        auto data =
+            ::RAJA::cuda::cuda_occupancy_max_blocks_threads<Self>(func,
+                                                                  shmem_size);
         recommended_blocks = data.func_max_blocks_per_device;
         recommended_threads = data.func_max_threads_per_block;
 
@@ -314,8 +349,8 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
 
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
+        recommended_blocks =
+            data.func_max_blocks_per_sm * data.device_sm_per_device;
       }
 
     } else {
@@ -334,18 +369,17 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
         // threads determined at compile-time
         //
         recommended_threads = num_threads;
-
       }
 
       //
       // blocks determined at compile-time
       //
       recommended_blocks = num_blocks;
-
     }
   }
 
-  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size), int &max_threads)
+  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size),
+                                 int &max_threads)
   {
     if (num_threads <= 0) {
 
@@ -361,12 +395,12 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
       // threads determined at compile-time
       //
       max_threads = num_threads;
-
     }
   }
 
   inline static void max_blocks(size_t shmem_size,
-      int &max_blocks, int actual_threads)
+                                int &max_blocks,
+                                int actual_threads)
   {
     auto func = Self::get_func();
 
@@ -375,14 +409,15 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
       //
       // determine blocks at runtime
       //
-      if (num_threads <= 0 ||
-          num_threads != actual_threads) {
+      if (num_threads <= 0 || num_threads != actual_threads) {
 
         //
         // determine blocks when actual_threads != num_threads
         //
-        auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self>(
-            func, shmem_size, actual_threads);
+        auto data =
+            ::RAJA::cuda::cuda_occupancy_max_blocks<Self>(func,
+                                                          shmem_size,
+                                                          actual_threads);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
 
       } else {
@@ -393,7 +428,6 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
       }
 
     } else {
@@ -402,7 +436,6 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
       // blocks determined at compile-time
       //
       max_blocks = num_blocks;
-
     }
   }
 };
@@ -416,8 +449,10 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
  * The algorithm is greedy (and probably could be improved), and favors
  * maximizing the number of threads (or blocks) in x, y, then z.
  */
-inline
-cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t minimum = cuda_dim_t()){
+inline cuda_dim_t fitCudaDims(cuda_dim_member_t limit,
+                              cuda_dim_t result,
+                              cuda_dim_t minimum = cuda_dim_t())
+{
 
 
   // clamp things to at least 1
@@ -430,12 +465,12 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
   minimum.z = minimum.z ? minimum.z : 1;
 
   // if we are under the limit, we're done
-  if(result.x * result.y * result.z <= limit) return result;
+  if (result.x * result.y * result.z <= limit) return result;
 
   // Can we reduce z to fit?
-  if(result.x * result.y * minimum.z < limit){
+  if (result.x * result.y * minimum.z < limit) {
     // compute a new z
-    result.z = limit / (result.x*result.y);
+    result.z = limit / (result.x * result.y);
     return result;
   }
   // we don't fit, so reduce z to it's minimum and continue on to y
@@ -443,9 +478,9 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
 
 
   // Can we reduce y to fit?
-  if(result.x * minimum.y * result.z < limit){
+  if (result.x * minimum.y * result.z < limit) {
     // compute a new y
-    result.y = limit / (result.x*result.z);
+    result.y = limit / (result.x * result.z);
     return result;
   }
   // we don't fit, so reduce y to it's minimum and continue on to x
@@ -453,9 +488,9 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
 
 
   // Can we reduce y to fit?
-  if(minimum.x * result.y * result.z < limit){
+  if (minimum.x * result.y * result.z < limit) {
     // compute a new x
-    result.x = limit / (result.y*result.z);
+    result.x = limit / (result.y * result.z);
     return result;
   }
   // we don't fit, so we'll return the smallest possible thing
@@ -470,7 +505,8 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
  */
 template <typename LaunchConfig, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>, Types> {
+    statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using StatementType =
@@ -481,7 +517,8 @@ struct StatementExecutor<
   {
 
     using data_t = camp::decay<Data>;
-    using executor_t = cuda_statement_list_executor_t<stmt_list_t, data_t, Types>;
+    using executor_t =
+        cuda_statement_list_executor_t<stmt_list_t, data_t, Types>;
     using launch_t = CudaLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
 
 
@@ -510,8 +547,9 @@ struct StatementExecutor<
       //
       int recommended_blocks;
       int recommended_threads;
-      launch_t::recommended_blocks_threads(
-          shmem, recommended_blocks, recommended_threads);
+      launch_t::recommended_blocks_threads(shmem,
+                                           recommended_blocks,
+                                           recommended_threads);
 
 
       //
@@ -524,24 +562,24 @@ struct StatementExecutor<
       //
       // Fit the requested threads
       //
-      cuda_dim_t fit_threads{0,0,0};
+      cuda_dim_t fit_threads{0, 0, 0};
 
-      if ( recommended_threads >= get_size(launch_dims.min_dims.threads) ) {
-
-        fit_threads = fitCudaDims(
-            recommended_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads >= get_size(launch_dims.min_dims.threads)) {
 
+        fit_threads = fitCudaDims(recommended_threads,
+                                  launch_dims.dims.threads,
+                                  launch_dims.min_dims.threads);
       }
 
       //
       // Redo fit with max threads
       //
-      if ( recommended_threads < max_threads &&
-           get_size(fit_threads) != recommended_threads ) {
-
-        fit_threads = fitCudaDims(
-            max_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads < max_threads &&
+          get_size(fit_threads) != recommended_threads) {
 
+        fit_threads = fitCudaDims(max_threads,
+                                  launch_dims.dims.threads,
+                                  launch_dims.min_dims.threads);
       }
 
       launch_dims.dims.threads = fit_threads;
@@ -555,7 +593,7 @@ struct StatementExecutor<
 
       int use_blocks;
 
-      if ( launch_dims.num_threads() == recommended_threads ) {
+      if (launch_dims.num_threads() == recommended_threads) {
 
         //
         // Fit the requested blocks
@@ -568,11 +606,11 @@ struct StatementExecutor<
         // Fit the max blocks
         //
         use_blocks = max_blocks;
-
       }
 
-      launch_dims.dims.blocks = fitCudaDims(
-          use_blocks, launch_dims.dims.blocks, launch_dims.min_dims.blocks);
+      launch_dims.dims.blocks = fitCudaDims(use_blocks,
+                                            launch_dims.dims.blocks,
+                                            launch_dims.min_dims.blocks);
 
       //
       // make sure that we fit
@@ -581,7 +619,7 @@ struct StatementExecutor<
       if(launch_dims.num_blocks() > max_blocks){
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num blocks");
       }*/
-      if(launch_dims.num_threads() > max_threads){
+      if (launch_dims.num_threads() > max_threads) {
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num threads");
       }
 
@@ -596,13 +634,23 @@ struct StatementExecutor<
         // currently an unresolved issue.
         //
         auto cuda_data = RAJA::cuda::make_launch_body(func,
-            launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res, data);
+                                                      launch_dims.dims.blocks,
+                                                      launch_dims.dims.threads,
+                                                      shmem,
+                                                      res,
+                                                      data);
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&cuda_data};
-        RAJA::cuda::launch(func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, launch_t::async);
+        void *args[] = {(void *)&cuda_data};
+        RAJA::cuda::launch(func,
+                           launch_dims.dims.blocks,
+                           launch_dims.dims.threads,
+                           args,
+                           shmem,
+                           res,
+                           launch_t::async);
       }
     }
   }
diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp
index 7a6d10f4ec..2fbd9d6d23 100644
--- a/include/RAJA/policy/cuda/kernel/For.hpp
+++ b/include/RAJA/policy/cuda/kernel/For.hpp
@@ -20,7 +20,6 @@
 #define RAJA_policy_cuda_kernel_For_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/cuda/kernel/internal.hpp"
 
 
@@ -44,9 +43,11 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                   EnclosedStmts...>,
+    statement::For<
+        ArgumentId,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
@@ -60,10 +61,10 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
+      RAJA::policy::cuda::
+          cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t i = IndexMapper::template index<diff_t>();
 
@@ -74,8 +75,7 @@ struct CudaStatementExecutor<
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -104,7 +104,9 @@ template <typename Data,
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                                                    sync,
+                                                    IndexMapper>,
                    EnclosedStmts...>,
     Types> {
 
@@ -119,10 +121,10 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+      RAJA::policy::cuda::
+          cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i = IndexMapper::template index<diff_t>();
@@ -138,8 +140,7 @@ struct CudaStatementExecutor<
     enclosed_stmts_t::exec(data, thread_active && have_work);
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -167,7 +168,10 @@ template <typename Data,
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
+                   RAJA::policy::cuda::cuda_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::sync,
+                       IndexMapper>,
                    EnclosedStmts...>,
     Types> {
 
@@ -182,11 +186,13 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+      RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -210,8 +216,7 @@ struct CudaStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     diff_t len = segment_length<ArgumentId>(data);
 
@@ -239,7 +244,10 @@ template <typename Data,
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
+                   RAJA::policy::cuda::cuda_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::none,
+                       IndexMapper>,
                    EnclosedStmts...>,
     Types> {
 
@@ -254,11 +262,13 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+      RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -277,8 +287,7 @@ struct CudaStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -304,13 +313,18 @@ struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
     Types>
-: CudaStatementExecutor<Data, statement::For<ArgumentId,
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
+    : CudaStatementExecutor<
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types> {
 };
 
 
@@ -322,33 +336,31 @@ struct CudaStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_warp_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct CudaStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::cuda_warp_masked_direct<Mask>,
+                                            EnclosedStmts...>,
+                             Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -358,13 +370,11 @@ struct CudaStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -379,7 +389,7 @@ struct CudaStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -391,38 +401,36 @@ struct CudaStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_warp_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct CudaStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::cuda_warp_masked_loop<Mask>,
+                                            EnclosedStmts...>,
+                             Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // masked size strided loop
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
     for (diff_t ii = 0; ii < len; ii += i_stride) {
@@ -441,9 +449,7 @@ struct CudaStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -458,7 +464,7 @@ struct CudaStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -470,30 +476,28 @@ struct CudaStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_thread_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+    Data,
+    statement::For<ArgumentId,
+                   RAJA::cuda_thread_masked_direct<Mask>,
+                   EnclosedStmts...>,
+    Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -503,13 +507,11 @@ struct CudaStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -525,7 +527,7 @@ struct CudaStatementExecutor<
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
@@ -537,36 +539,33 @@ struct CudaStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_thread_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct CudaStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::cuda_thread_masked_loop<Mask>,
+                                            EnclosedStmts...>,
+                             Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // masked size strided loop
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
     for (diff_t ii = 0; ii < len; ii += i_stride) {
@@ -585,9 +584,7 @@ struct CudaStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -603,7 +600,7 @@ struct CudaStatementExecutor<
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp
index 92a59cb9a8..9e34585762 100644
--- a/include/RAJA/policy/cuda/kernel/ForICount.hpp
+++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp
@@ -20,7 +20,6 @@
 #define RAJA_policy_cuda_kernel_ForICount_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/cuda/kernel/internal.hpp"
 
 
@@ -46,29 +45,37 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::policy::cuda::cuda_indexer<
+                             iteration_mapping::DirectUnchecked,
+                             sync,
+                             IndexMapper>,
+                         EnclosedStmts...>,
+          Types> {
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
+                                           sync,
+                                           IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t i = IndexMapper::template index<diff_t>();
@@ -98,29 +105,35 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::
+                  cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = CudaStatementExecutor<
       Data,
       statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                                                      sync,
+                                                      IndexMapper>,
                      EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -154,29 +167,41 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -217,29 +242,41 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -277,13 +314,18 @@ struct CudaStatementExecutor<
     Data,
     statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
     Types>
-: CudaStatementExecutor<Data, statement::ForICount<ArgumentId,
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
+    : CudaStatementExecutor<
+          Data,
+          statement::ForICount<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types> {
 };
 
 
@@ -296,40 +338,46 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_warp_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_warp_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = CudaStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::cuda_warp_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::cuda_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      CudaStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::cuda_warp_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -340,9 +388,8 @@ struct CudaStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
 
@@ -355,45 +402,51 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_warp_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_warp_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = CudaStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::cuda_warp_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::cuda_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      CudaStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::cuda_warp_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // masked size strided loop
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
     for (diff_t ii = 0; ii < len; ii += i_stride) {
@@ -411,7 +464,6 @@ struct CudaStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 
@@ -424,37 +476,42 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_thread_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_thread_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::cuda_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types> {
 
   using Base = CudaStatementExecutor<
-          Data,
-          statement::For<ArgumentId, RAJA::cuda_thread_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+      Data,
+      statement::For<ArgumentId,
+                     RAJA::cuda_thread_masked_direct<Mask>,
+                     EnclosedStmts...>,
+      Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -465,9 +522,8 @@ struct CudaStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
 
@@ -480,42 +536,47 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_thread_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_thread_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = CudaStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::cuda_thread_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::cuda_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      CudaStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::cuda_thread_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // masked size strided loop
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
     for (diff_t ii = 0; ii < len; ii += i_stride) {
@@ -533,7 +594,6 @@ struct CudaStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 }  // namespace internal
diff --git a/include/RAJA/policy/cuda/kernel/Hyperplane.hpp b/include/RAJA/policy/cuda/kernel/Hyperplane.hpp
index fd33192a65..adb87f6011 100644
--- a/include/RAJA/policy/cuda/kernel/Hyperplane.hpp
+++ b/include/RAJA/policy/cuda/kernel/Hyperplane.hpp
@@ -18,17 +18,14 @@
 #ifndef RAJA_policy_cuda_kernel_Hyperplane_HPP
 #define RAJA_policy_cuda_kernel_Hyperplane_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "camp/camp.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/Hyperplane.hpp"
-
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -41,33 +38,30 @@ template <typename Data,
           camp::idx_t... Args,
           typename... EnclosedStmts,
           typename Types>
-struct CudaStatementExecutor<Data,
-                             statement::Hyperplane<HpArgumentId,
-                                                   seq_exec,
-                                                   ArgList<Args...>,
-                                                   EnclosedStmts...>,
-                             Types> {
+struct CudaStatementExecutor<
+    Data,
+    statement::
+        Hyperplane<HpArgumentId, seq_exec, ArgList<Args...>, EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, HpArgumentId, Data>;
 
-  using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+  using enclosed_stmts_t =
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // compute Manhattan distance of iteration space to determine
     // as:  hp_len = l0 + l1 + l2 + ...
-    int hp_len = segment_length<HpArgumentId>(data) +
-                 foldl(RAJA::operators::plus<int>(),
-                               segment_length<Args>(data)...);
+    int hp_len =
+        segment_length<HpArgumentId>(data) +
+        foldl(RAJA::operators::plus<int>(), segment_length<Args>(data)...);
 
     int h_args = foldl(RAJA::operators::plus<idx_t>(),
-        camp::get<Args>(data.offset_tuple)...);
+                       camp::get<Args>(data.offset_tuple)...);
 
     // get length of i dimension
     auto i_len = segment_length<HpArgumentId>(data);
@@ -93,18 +87,13 @@ struct CudaStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
 };
 
 
-
-
 }  // end namespace internal
 
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp b/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp
index 258cd204d6..cb5346b899 100644
--- a/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp
+++ b/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp
@@ -19,16 +19,14 @@
 #ifndef RAJA_policy_cuda_kernel_InitLocalMem_HPP
 #define RAJA_policy_cuda_kernel_InitLocalMem_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/InitLocalMem.hpp"
 #include "RAJA/policy/cuda/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -39,27 +37,31 @@ struct cuda_shared_mem;
 namespace internal
 {
 
-//Intialize thread shared array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
+// Initialize thread shared array
+template <typename Data,
+          camp::idx_t... Indices,
+          typename... EnclosedStmts,
+          typename Types>
 struct CudaStatementExecutor<Data,
                              statement::InitLocalMem<RAJA::cuda_shared_mem,
-                             camp::idx_seq<Indices...>, EnclosedStmts...>,
-                             Types>
-{
+                                                     camp::idx_seq<Indices...>,
+                                                     EnclosedStmts...>,
+                             Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  //Launch loops
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Launch loops
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void initMem(Data &data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos,
+        typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem =
+        camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::
+            layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -67,40 +69,35 @@ struct CudaStatementExecutor<Data,
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Initialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void initMem(Data &data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos,
+        typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem =
+        camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::
+            layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null base case
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void setPtrToNull(Data &data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
 
-  //Set pointer to null recursive case
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null recursive case
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void setPtrToNull(Data &data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
@@ -108,47 +105,48 @@ struct CudaStatementExecutor<Data,
   }
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
 
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
-//Intialize thread private array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
-struct CudaStatementExecutor<Data, statement::InitLocalMem<RAJA::cuda_thread_mem, camp::idx_seq<Indices...>, EnclosedStmts...>, Types>
-{
+// Initialize thread private array
+template <typename Data,
+          camp::idx_t... Indices,
+          typename... EnclosedStmts,
+          typename Types>
+struct CudaStatementExecutor<Data,
+                             statement::InitLocalMem<RAJA::cuda_thread_mem,
+                                                     camp::idx_seq<Indices...>,
+                                                     EnclosedStmts...>,
+                             Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  //Launch loops
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Launch loops
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void initMem(Data &data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos,
+        typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem =
+        camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::
+            layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -156,40 +154,35 @@ struct CudaStatementExecutor<Data, statement::InitLocalMem<RAJA::cuda_thread_mem
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Initialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void initMem(Data &data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos,
+        typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem =
+        camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::
+            layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null base case
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void setPtrToNull(Data &data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
 
-  //Set pointer to null recursive case
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null recursive case
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void setPtrToNull(Data &data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
@@ -197,31 +190,24 @@ struct CudaStatementExecutor<Data, statement::InitLocalMem<RAJA::cuda_thread_mem
   }
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
 
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/kernel/Lambda.hpp b/include/RAJA/policy/cuda/kernel/Lambda.hpp
index e932a3e270..8025d4f3c5 100644
--- a/include/RAJA/policy/cuda/kernel/Lambda.hpp
+++ b/include/RAJA/policy/cuda/kernel/Lambda.hpp
@@ -26,13 +26,11 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -40,30 +38,32 @@ namespace RAJA
 namespace internal
 {
 
-template <typename Data, camp::idx_t LambdaIndex, typename... Args, typename Types>
-struct CudaStatementExecutor<Data, statement::Lambda<LambdaIndex, Args...>, Types> {
+template <typename Data,
+          camp::idx_t LambdaIndex,
+          typename... Args,
+          typename Types>
+struct CudaStatementExecutor<Data,
+                             statement::Lambda<LambdaIndex, Args...>,
+                             Types> {
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Only execute the lambda if it hasn't been masked off
-    if(thread_active){
-      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(data);
+    if (thread_active) {
+      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(
+          data);
     }
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const &RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
 
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/kernel/Reduce.hpp b/include/RAJA/policy/cuda/kernel/Reduce.hpp
index 7e46748991..811738f14c 100644
--- a/include/RAJA/policy/cuda/kernel/Reduce.hpp
+++ b/include/RAJA/policy/cuda/kernel/Reduce.hpp
@@ -20,7 +20,6 @@
 #define RAJA_policy_cuda_kernel_Reduce_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/cuda/kernel/internal.hpp"
 
 
@@ -35,7 +34,8 @@ namespace internal
 // Executor that handles reductions across a single CUDA thread block
 //
 template <typename Data,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts,
           typename Types>
@@ -73,7 +73,7 @@ struct CudaStatementExecutor<Data,
 
     // execute enclosed statements, and mask off everyone but thread 0
     thread_active = threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
-    if(thread_active){
+    if (thread_active) {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
@@ -94,7 +94,8 @@ struct CudaStatementExecutor<Data,
 // Executor that handles reductions across a single CUDA thread warp
 //
 template <typename Data,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts,
           typename Types>
@@ -125,13 +126,12 @@ struct CudaStatementExecutor<Data,
     // Call warp reduction routine
     using combiner_t =
         RAJA::reduce::detail::op_adapter<value_t, ReduceOperator>;
-    value_t new_value =
-        RAJA::cuda::impl::warp_reduce<combiner_t>(value, ident);
+    value_t new_value = RAJA::cuda::impl::warp_reduce<combiner_t>(value, ident);
     data.template assign_param<ParamId>(new_value);
 
     // execute enclosed statements, and mask off everyone but lane 0
     thread_active = threadIdx.x == 0;
-    if(thread_active){
+    if (thread_active) {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
@@ -148,7 +148,6 @@ struct CudaStatementExecutor<Data,
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/kernel/Sync.hpp b/include/RAJA/policy/cuda/kernel/Sync.hpp
index 7dd45d8837..5b15ecc850 100644
--- a/include/RAJA/policy/cuda/kernel/Sync.hpp
+++ b/include/RAJA/policy/cuda/kernel/Sync.hpp
@@ -27,12 +27,10 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
-
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -60,15 +58,11 @@ namespace internal
 template <typename Data, typename Types>
 struct CudaStatementExecutor<Data, statement::CudaSyncThreads, Types> {
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &, bool) { __syncthreads(); }
+  static inline RAJA_DEVICE void exec(Data &, bool) { __syncthreads(); }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const &RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
@@ -77,18 +71,22 @@ struct CudaStatementExecutor<Data, statement::CudaSyncThreads, Types> {
 template <typename Data, typename Types>
 struct CudaStatementExecutor<Data, statement::CudaSyncWarp, Types> {
 
-  static
-  inline
-  RAJA_DEVICE
+  static inline RAJA_DEVICE
 #if CUDART_VERSION >= 9000
-  void exec(Data &, bool) { __syncwarp(); }
+      void
+      exec(Data &, bool)
+  {
+    __syncwarp();
+  }
 #else
-  void exec(Data &, bool) {  }
+      void
+      exec(Data &, bool)
+  {
+  }
 #endif
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const &RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp
index a5377f7d7d..4bf2ea8b66 100644
--- a/include/RAJA/policy/cuda/kernel/Tile.hpp
+++ b/include/RAJA/policy/cuda/kernel/Tile.hpp
@@ -27,16 +27,14 @@
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/pattern/kernel/Tile.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 #include "camp/camp.hpp"
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/pattern/kernel/Tile.hpp"
-#include "RAJA/pattern/kernel/internal.hpp"
-
 namespace RAJA
 {
 namespace internal
@@ -56,12 +54,13 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -69,10 +68,11 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
+  using DimensionCalculator = KernelDimensionCalculator<
+      RAJA::policy::cuda::
+          cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -80,7 +80,8 @@ struct CudaStatementExecutor<
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Keep copy of original segment, so we can restore it
     segment_t orig_segment = segment;
@@ -95,12 +96,12 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
     DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
@@ -116,9 +117,9 @@ struct CudaStatementExecutor<
     // restrict to first tile
     segment = segment.slice(0, static_cast<diff_t>(chunk_size));
 
-    // NOTE: We do not detect improper uses of direct_unchecked policies under tiling.
-    // This happens when using a direct unchecked policy on a tiled range that is not
-    // evenly divisible by chunk_size.
+    // NOTE: We do not detect improper uses of direct_unchecked policies under
+    // tiling. This happens when using a direct unchecked policy on a tiled
+    // range that is not evenly divisible by chunk_size.
     LaunchDims enclosed_dims =
         enclosed_stmts_t::calculateDimensions(private_data);
 
@@ -142,10 +143,11 @@ struct CudaStatementExecutor<
     Data,
     statement::Tile<ArgumentId,
                     RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                                                     sync,
+                                                     IndexMapper>,
                     EnclosedStmts...>,
-                    Types>
-  {
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -153,10 +155,11 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+  using DimensionCalculator = KernelDimensionCalculator<
+      RAJA::policy::cuda::
+          cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -165,7 +168,8 @@ struct CudaStatementExecutor<
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -184,12 +188,12 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
     DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
@@ -225,11 +229,15 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -237,10 +245,13 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -251,8 +262,10 @@ struct CudaStatementExecutor<
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
@@ -274,12 +287,12 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
     DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
@@ -315,11 +328,15 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -327,10 +344,13 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -341,8 +361,10 @@ struct CudaStatementExecutor<
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
@@ -359,12 +381,12 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
     DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
@@ -400,14 +422,21 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>
-: CudaStatementExecutor<Data, statement::Tile<ArgumentId, TPol,
-    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
+    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : CudaStatementExecutor<
+          Data,
+          statement::Tile<
+              ArgumentId,
+              TPol,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types> {
 };
 
 }  // end namespace internal
diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp
index a2de5e2bf3..a30262ce36 100644
--- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp
+++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp
@@ -27,16 +27,14 @@
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/pattern/kernel/Tile.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 #include "camp/camp.hpp"
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/pattern/kernel/Tile.hpp"
-#include "RAJA/pattern/kernel/internal.hpp"
-
 namespace RAJA
 {
 namespace internal
@@ -58,32 +56,40 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<ArgumentId,
+                          RAJA::tile_fixed<chunk_size>,
+                          RAJA::policy::cuda::cuda_indexer<
+                              iteration_mapping::DirectUnchecked,
+                              sync,
+                              IndexMapper>,
+                          EnclosedStmts...>,
+          Types> {
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
+                                           sync,
+                                           IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -125,32 +131,38 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::cuda::
+                  cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::
+              cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -196,32 +208,44 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -239,7 +263,7 @@ struct CudaStatementExecutor<
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for(diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
+    for (diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -274,32 +298,44 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -317,7 +353,7 @@ struct CudaStatementExecutor<
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for(diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -346,14 +382,23 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>, Types>
-: CudaStatementExecutor<Data, statement::TileTCount<ArgumentId, ParamId, TPol,
-    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
+    statement::
+        TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : CudaStatementExecutor<
+          Data,
+          statement::TileTCount<
+              ArgumentId,
+              ParamId,
+              TPol,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types> {
 };
 
 }  // end namespace internal
diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp
index 6e3ddcdde8..9888bf38a2 100644
--- a/include/RAJA/policy/cuda/kernel/internal.hpp
+++ b/include/RAJA/policy/cuda/kernel/internal.hpp
@@ -27,15 +27,12 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/policy/cuda/policy.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -54,19 +51,16 @@ struct LaunchDims {
   LaunchDims& operator=(LaunchDims const&) = default;
 
   RAJA_INLINE
-  LaunchDims(CudaDims _dims)
-    : dims{_dims}
-    , min_dims{}
-  { }
+  LaunchDims(CudaDims _dims) : dims{_dims}, min_dims{} {}
 
   RAJA_INLINE
   LaunchDims(CudaDims _dims, CudaDims _min_dims)
-    : dims{_dims}
-    , min_dims{_min_dims}
-  { }
+      : dims{_dims}, min_dims{_min_dims}
+  {
+  }
 
   RAJA_INLINE
-  LaunchDims max(LaunchDims const &c) const
+  LaunchDims max(LaunchDims const& c) const
   {
     LaunchDims result;
 
@@ -82,38 +76,38 @@ struct LaunchDims {
     result.dims.threads.y = std::max(c.dims.threads.y, dims.threads.y);
     result.dims.threads.z = std::max(c.dims.threads.z, dims.threads.z);
 
-    result.min_dims.threads.x = std::max(c.min_dims.threads.x, min_dims.threads.x);
-    result.min_dims.threads.y = std::max(c.min_dims.threads.y, min_dims.threads.y);
-    result.min_dims.threads.z = std::max(c.min_dims.threads.z, min_dims.threads.z);
+    result.min_dims.threads.x =
+        std::max(c.min_dims.threads.x, min_dims.threads.x);
+    result.min_dims.threads.y =
+        std::max(c.min_dims.threads.y, min_dims.threads.y);
+    result.min_dims.threads.z =
+        std::max(c.min_dims.threads.z, min_dims.threads.z);
 
     return result;
   }
 
   RAJA_INLINE
-  int num_blocks() const {
-    return dims.num_blocks();
-  }
+  int num_blocks() const { return dims.num_blocks(); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return dims.num_threads();
-  }
+  int num_threads() const { return dims.num_threads(); }
 
 
   RAJA_INLINE
-  void clamp_to_min_blocks() {
+  void clamp_to_min_blocks()
+  {
     dims.blocks.x = std::max(min_dims.blocks.x, dims.blocks.x);
     dims.blocks.y = std::max(min_dims.blocks.y, dims.blocks.y);
     dims.blocks.z = std::max(min_dims.blocks.z, dims.blocks.z);
   };
 
   RAJA_INLINE
-  void clamp_to_min_threads() {
+  void clamp_to_min_threads()
+  {
     dims.threads.x = std::max(min_dims.threads.x, dims.threads.x);
     dims.threads.y = std::max(min_dims.threads.y, dims.threads.y);
     dims.threads.z = std::max(min_dims.threads.z, dims.threads.z);
   };
-
 };
 
 
@@ -126,7 +120,7 @@ struct CudaStatementListExecutorHelper {
   using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &data, bool thread_active)
+  inline static RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute stmt
     cur_stmt_t::exec(data, thread_active);
@@ -137,7 +131,7 @@ struct CudaStatementListExecutorHelper {
 
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &data)
+  inline static LaunchDims calculateDimensions(Data& data)
   {
     // Compute this statements launch dimensions
     LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
@@ -154,13 +148,13 @@ template <camp::idx_t num_stmts, typename StmtList>
 struct CudaStatementListExecutorHelper<num_stmts, num_stmts, StmtList> {
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &, bool)
+  inline static RAJA_DEVICE void exec(Data&, bool)
   {
     // nop terminator
   }
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &)
+  inline static LaunchDims calculateDimensions(Data&)
   {
     return LaunchDims();
   }
@@ -182,20 +176,15 @@ struct CudaStatementListExecutor<Data, StatementList<Stmts...>, Types> {
 
   static constexpr size_t num_stmts = sizeof...(Stmts);
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute statements in order with helper class
-    CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, thread_active);
+    CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
+        data, thread_active);
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute this statements launch dimensions
     return CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::
@@ -205,79 +194,89 @@ struct CudaStatementListExecutor<Data, StatementList<Stmts...>, Types> {
 
 
 template <typename StmtList, typename Data, typename Types>
-using cuda_statement_list_executor_t = CudaStatementListExecutor<
-    Data,
-    StmtList,
-    Types>;
+using cuda_statement_list_executor_t =
+    CudaStatementListExecutor<Data, StmtList, Types>;
 
 
 // specialization for direct sequential policies
-template<typename kernel_indexer>
+template <typename kernel_indexer>
 struct KernelDimensionCalculator;
 
 // specialization for direct unchecked sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
-
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
-  {
-    if ( len != static_cast<IdxT>(1) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the directly mapped index space");
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>> {
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
+  {
+    if (len != static_cast<IdxT>(1)) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the directly mapped index space");
     }
   }
 };
 
 // specialization for direct unchecked thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>> {
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(len));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len != static_cast<IdxT>(IndexMapper::block_size) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    if (len != static_cast<IdxT>(IndexMapper::block_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for direct unchecked block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>> {
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -285,36 +284,45 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len != static_cast<IdxT>(IndexMapper::grid_size) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    if (len != static_cast<IdxT>(IndexMapper::grid_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for direct unchecked global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    cuda::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>> {
+  using IndexMapper = cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
     if (len != static_cast<IdxT>(0)) {
       RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
@@ -322,141 +330,179 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    const IdxT block_size = RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size));
-    if ( len != (block_size * static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    const IdxT block_size =
+        RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size));
+    if (len != (block_size * static_cast<IdxT>(IndexMapper::grid_size))) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_cuda_dim<dim>(dims.threads, block_size);
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
     set_cuda_dim<dim>(min_dims.threads, block_size);
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    const IdxT grid_size = RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size));
-    if ( len != (static_cast<IdxT>(IndexMapper::block_size) * grid_size) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    const IdxT grid_size =
+        RAJA_DIVIDE_CEILING_INT(len,
+                                static_cast<IdxT>(IndexMapper::block_size));
+    if (len != (static_cast<IdxT>(IndexMapper::block_size) * grid_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(dims.blocks, grid_size);
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(min_dims.blocks, grid_size);
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len != (static_cast<IdxT>(IndexMapper::block_size) *
-                 static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    if (len != (static_cast<IdxT>(IndexMapper::block_size) *
+                static_cast<IdxT>(IndexMapper::grid_size))) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 
 // specialization for direct sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
-
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
-  {
-    if ( len > static_cast<IdxT>(1) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>> {
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
+  {
+    if (len > static_cast<IdxT>(1)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
   }
 };
 
 // specialization for direct thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>> {
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(len));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::block_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::block_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for direct block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>> {
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -464,36 +510,44 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::grid_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for direct global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>> {
+  using IndexMapper = cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
     if (len > static_cast<IdxT>(0)) {
       RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
@@ -501,127 +555,165 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_cuda_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_cuda_dim<dim>(dims.threads,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::grid_size)));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_cuda_dim<dim>(dims.blocks,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::block_size)));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len > (static_cast<IdxT>(IndexMapper::block_size) *
-                static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (static_cast<IdxT>(IndexMapper::block_size) *
+               static_cast<IdxT>(IndexMapper::grid_size))) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 
 // specialization for strided loop sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>> {
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT RAJA_UNUSED_ARG(len))
   {
   }
 };
 
 // specialization for strided loop thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>> {
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& dims,
+                             CudaDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for strided loop block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>> {
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -629,32 +721,39 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& dims,
+                             CudaDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for strided loop global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
-{
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>> {
+  using IndexMapper = cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     if (len > static_cast<IdxT>(0)) {
@@ -666,62 +765,86 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_cuda_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_cuda_dim<dim>(dims.threads,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(dims.blocks,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(1));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& dims,
+                             CudaDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index 574899f408..450080ebbb 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -18,10 +18,10 @@
 #ifndef RAJA_pattern_launch_cuda_HPP
 #define RAJA_pattern_launch_cuda_HPP
 
-#include "RAJA/pattern/launch/launch_core.hpp"
 #include "RAJA/pattern/detail/privatizer.hpp"
-#include "RAJA/policy/cuda/policy.hpp"
+#include "RAJA/pattern/launch/launch_core.hpp"
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
+#include "RAJA/policy/cuda/policy.hpp"
 #include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
 #include "RAJA/util/resource.hpp"
 
@@ -35,9 +35,9 @@ __global__ void launch_global_fcn(BODY body_in)
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto &body = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
@@ -45,38 +45,46 @@ __global__ void launch_global_fcn(BODY body_in)
 }
 
 template <typename BODY, typename ReduceParams>
-__global__ void launch_new_reduce_global_fcn(BODY body_in, ReduceParams reduce_params)
+__global__ void launch_new_reduce_global_fcn(BODY body_in,
+                                             ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto &body = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
 template <bool async>
-struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usage::unspecified, named_usage::unspecified>> {
+struct LaunchExecute<
+    RAJA::policy::cuda::cuda_launch_explicit_t<async,
+                                               named_usage::unspecified,
+                                               named_usage::unspecified>> {
 
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_global_fcn<BODY>);
+    auto func = reinterpret_cast<const void *>(&launch_global_fcn<BODY>);
 
     resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
 
@@ -84,18 +92,19 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+    cuda_dim_t gridSize{static_cast<cuda_dim_member_t>(params.teams.value[0]),
+                        static_cast<cuda_dim_member_t>(params.teams.value[1]),
+                        static_cast<cuda_dim_member_t>(params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+    cuda_dim_t blockSize{
+        static_cast<cuda_dim_member_t>(params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) {
 
       RAJA_FT_BEGIN;
 
@@ -105,14 +114,26 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body =
+            RAJA::cuda::make_launch_body(func,
+                                         gridSize,
+                                         blockSize,
+                                         shared_mem_size,
+                                         cuda_res,
+                                         std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void *args[] = {(void *)&body};
+        RAJA::cuda::launch(func,
+                           gridSize,
+                           blockSize,
+                           args,
+                           shared_mem_size,
+                           cuda_res,
+                           async,
+                           kernel_name);
       }
 
       RAJA_FT_END;
@@ -121,17 +142,22 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  //Version with explicit reduction parameters..
+  // Version with explicit reduction parameters..
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &launch_params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
+    auto func = reinterpret_cast<const void *>(
         &launch_new_reduce_global_fcn<BODY, camp::decay<ReduceParams>>);
 
     resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
@@ -140,18 +166,20 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[2]) };
+    cuda_dim_t gridSize{
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[2]) };
+    cuda_dim_t blockSize{
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) {
 
       RAJA_FT_BEGIN;
 
@@ -163,22 +191,39 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
       launch_info.res = cuda_res;
 
       {
-        using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usage::unspecified, named_usage::unspecified>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<
+            async,
+            named_usage::unspecified,
+            named_usage::unspecified>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body =
+            RAJA::cuda::make_launch_body(func,
+                                         gridSize,
+                                         blockSize,
+                                         shared_mem_size,
+                                         cuda_res,
+                                         std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
-
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        void *args[] = {(void *)&body, (void *)&launch_reducers};
+        RAJA::cuda::launch(func,
+                           gridSize,
+                           blockSize,
+                           args,
+                           shared_mem_size,
+                           cuda_res,
+                           async,
+                           kernel_name);
+
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -186,60 +231,70 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
 template <typename BODY, int num_threads, size_t BLOCKS_PER_SM>
 __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
-void launch_global_fcn_fixed(BODY body_in)
+    void launch_global_fcn_fixed(BODY body_in)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto &body = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
   body(ctx);
 }
 
-template <typename BODY, int num_threads, size_t BLOCKS_PER_SM, typename ReduceParams>
+template <typename BODY,
+          int num_threads,
+          size_t BLOCKS_PER_SM,
+          typename ReduceParams>
 __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
-void launch_new_reduce_global_fcn_fixed(BODY body_in, ReduceParams reduce_params)
+    void launch_new_reduce_global_fcn_fixed(BODY body_in,
+                                            ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto &body = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
 template <bool async, int nthreads, size_t BLOCKS_PER_SM>
-struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>> {
+struct LaunchExecute<
+    RAJA::policy::cuda::
+        cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>> {
 
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
+    auto func = reinterpret_cast<const void *>(
         &launch_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM>);
 
     resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
@@ -248,18 +303,19 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+    cuda_dim_t gridSize{static_cast<cuda_dim_member_t>(params.teams.value[0]),
+                        static_cast<cuda_dim_member_t>(params.teams.value[1]),
+                        static_cast<cuda_dim_member_t>(params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+    cuda_dim_t blockSize{
+        static_cast<cuda_dim_member_t>(params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) {
 
       RAJA_FT_BEGIN;
 
@@ -269,14 +325,26 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body =
+            RAJA::cuda::make_launch_body(func,
+                                         gridSize,
+                                         blockSize,
+                                         shared_mem_size,
+                                         cuda_res,
+                                         std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void *args[] = {(void *)&body};
+        RAJA::cuda::launch(func,
+                           gridSize,
+                           blockSize,
+                           args,
+                           shared_mem_size,
+                           cuda_res,
+                           async,
+                           kernel_name);
       }
 
       RAJA_FT_END;
@@ -285,18 +353,26 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  //Version with explicit reduction parameters..
+  // Version with explicit reduction parameters..
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &launch_params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_new_reduce_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM, camp::decay<ReduceParams>>);
+    auto func = reinterpret_cast<const void *>(
+        &launch_new_reduce_global_fcn_fixed<BODY,
+                                            nthreads,
+                                            BLOCKS_PER_SM,
+                                            camp::decay<ReduceParams>>);
 
     resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
 
@@ -304,18 +380,20 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[2]) };
+    cuda_dim_t gridSize{
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[2]) };
+    cuda_dim_t blockSize{
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) {
 
       RAJA_FT_BEGIN;
 
@@ -327,22 +405,37 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
       launch_info.res = cuda_res;
 
       {
-        using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL = RAJA::policy::cuda::
+            cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body =
+            RAJA::cuda::make_launch_body(func,
+                                         gridSize,
+                                         blockSize,
+                                         shared_mem_size,
+                                         cuda_res,
+                                         std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
-
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        void *args[] = {(void *)&body, (void *)&launch_reducers};
+        RAJA::cuda::launch(func,
+                           gridSize,
+                           blockSize,
+                           args,
+                           shared_mem_size,
+                           cuda_res,
+                           async,
+                           kernel_name);
+
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -350,7 +443,6 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
@@ -358,18 +450,20 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
    CUDA generic loop implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
   {
     const diff_t i = IndexMapper::template index<diff_t>();
 
@@ -378,13 +472,15 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -400,15 +496,20 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -429,18 +530,20 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
   {
     const diff_t len = segment.end() - segment.begin();
     const diff_t i = IndexMapper::template index<diff_t>();
@@ -452,13 +555,15 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -479,15 +584,20 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -514,18 +624,21 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
   {
     const diff_t len = segment.end() - segment.begin();
     const diff_t i_init = IndexMapper::template index<diff_t>();
@@ -538,13 +651,16 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -566,22 +682,27 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
 
       for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1));
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1));
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -623,12 +744,14 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
    CUDA generic loop_icount implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -642,13 +765,15 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
   }
 };
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -660,21 +785,24 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    body(*(segment0.begin() + i0),
-         *(segment1.begin() + i1),
-         i0, i1);
+    body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -691,17 +819,21 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     body(*(segment0.begin() + i0),
          *(segment1.begin() + i1),
          *(segment2.begin() + i2),
-         i0, i1, i2);
+         i0,
+         i1,
+         i2);
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -718,13 +850,15 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
   }
 };
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -740,22 +874,25 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
     if (i0 < len0 && i1 < len1) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           i0, i1);
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -777,18 +914,23 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
       body(*(segment0.begin() + i0),
            *(segment1.begin() + i1),
            *(segment2.begin() + i2),
-           i0, i1, i2);
+           i0,
+           i1,
+           i2);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -807,13 +949,16 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -835,23 +980,27 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
 
       for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1),
-             i0, i1);
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -882,7 +1031,9 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
           body(*(segment0.begin() + i0),
                *(segment1.begin() + i1),
                *(segment2.begin() + i2),
-               i0, i1, i2);
+               i0,
+               i1,
+               i2);
         }
       }
     }
@@ -893,27 +1044,30 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
 /*
    CUDA generic flattened loop implementations
 */
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  sync,
-                                                  IndexMapper0>,
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<
+                       RAJA::iteration_mapping::DirectUnchecked,
+                       sync,
+                       IndexMapper0>,
                    SEGMENT>
-{};
+    : LoopExecute<RAJA::policy::cuda::cuda_indexer<
+                      RAJA::iteration_mapping::DirectUnchecked,
+                      sync,
+                      IndexMapper0>,
+                  SEGMENT> {
+};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<
+                       RAJA::iteration_mapping::DirectUnchecked,
+                       kernel_sync_requirement::none,
+                       IndexMapper0,
+                       IndexMapper1>,
+                   SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -924,23 +1078,27 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
 
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
 
-    const int i = i0 + i0_stride*i1;
+    const int i = i0 + i0_stride * i1;
 
     body(*(segment.begin() + i));
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<
+                       RAJA::iteration_mapping::DirectUnchecked,
+                       kernel_sync_requirement::none,
+                       IndexMapper0,
+                       IndexMapper1,
+                       IndexMapper2>,
+                   SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -953,33 +1111,35 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    const int i = i0 + i0_stride*(i1 + i1_stride*i2);
+    const int i = i0 + i0_stride * (i1 + i1_stride * i2);
 
     body(*(segment.begin() + i));
   }
 };
 
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
-{};
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                             sync,
+                                             IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::cuda::
+              cuda_indexer<RAJA::iteration_mapping::Direct, sync, IndexMapper0>,
+          SEGMENT> {
+};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                             kernel_sync_requirement::none,
+                                             IndexMapper0,
+                                             IndexMapper1>,
+    SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -992,7 +1152,7 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
 
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
 
-    const int i = i0 + i0_stride*i1;
+    const int i = i0 + i0_stride * i1;
 
     if (i < len) {
       body(*(segment.begin() + i));
@@ -1000,17 +1160,21 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                             kernel_sync_requirement::none,
+                                             IndexMapper0,
+                                             IndexMapper1,
+                                             IndexMapper2>,
+    SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -1025,7 +1189,7 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    const int i = i0 + i0_stride*(i1 + i1_stride*i2);
+    const int i = i0 + i0_stride * (i1 + i1_stride * i2);
 
     if (i < len) {
       body(*(segment.begin() + i));
@@ -1033,27 +1197,33 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
   }
 };
 
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
-{};
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        sync,
+        IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::cuda::cuda_indexer<
+              RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+              sync,
+              IndexMapper0>,
+          SEGMENT> {
+};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -1067,25 +1237,28 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const int i0_stride = IndexMapper0::template size<diff_t>();
     const int i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*i1;
-         i < len;
-         i += i0_stride*i1_stride) {
+    for (int i = i0 + i0_stride * i1; i < len; i += i0_stride * i1_stride) {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -1101,9 +1274,8 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const int i1_stride = IndexMapper1::template size<diff_t>();
     const int i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*(i1 + i1_stride*i2);
-         i < len;
-         i += i0_stride*i1_stride*i2_stride) {
+    for (int i = i0 + i0_stride * (i1 + i1_stride * i2); i < len;
+         i += i0_stride * i1_stride * i2_stride) {
       body(*(segment.begin() + i));
     }
   }
@@ -1114,12 +1286,14 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
    CUDA generic tile implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1128,20 +1302,23 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
       SEGMENT const &segment,
       BODY const &body)
   {
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
 
     body(segment.slice(i, static_cast<diff_t>(tile_size)));
   }
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1152,23 +1329,30 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
       SEGMENT const &segment1,
       BODY const &body)
   {
-    const diff_t i0 = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1 = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i0 = IndexMapper0::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size0);
+    const diff_t i1 = IndexMapper1::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size1);
 
     body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
          segment1.slice(i1, static_cast<diff_t>(tile_size1)));
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1181,9 +1365,12 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
       SEGMENT const &segment2,
       BODY const &body)
   {
-    const diff_t i0 = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1 = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
-    const diff_t i2 = IndexMapper2::template index<diff_t>() * static_cast<diff_t>(tile_size2);
+    const diff_t i0 = IndexMapper0::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size0);
+    const diff_t i1 = IndexMapper1::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size1);
+    const diff_t i2 = IndexMapper2::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size2);
 
     body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
          segment1.slice(i1, static_cast<diff_t>(tile_size1)),
@@ -1192,12 +1379,14 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1207,7 +1396,8 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
       BODY const &body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
 
     if (i < len) {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
@@ -1216,13 +1406,15 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1236,8 +1428,10 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
     const diff_t len0 = segment0.end() - segment0.begin();
     const diff_t len1 = segment1.end() - segment1.begin();
 
-    const diff_t i0 = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1 = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i0 = IndexMapper0::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size0);
+    const diff_t i1 = IndexMapper1::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size1);
 
     if (i0 < len0 && i1 < len1) {
       body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
@@ -1246,15 +1440,20 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1271,9 +1470,12 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
     const diff_t len1 = segment1.end() - segment1.begin();
     const diff_t len2 = segment2.end() - segment2.begin();
 
-    const diff_t i0 = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1 = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
-    const diff_t i2 = IndexMapper2::template index<diff_t>() * static_cast<diff_t>(tile_size2);
+    const diff_t i0 = IndexMapper0::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size0);
+    const diff_t i1 = IndexMapper1::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size1);
+    const diff_t i2 = IndexMapper2::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size2);
 
     if (i0 < len0 && i1 < len1 && i2 < len2) {
       body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
@@ -1284,12 +1486,15 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1299,8 +1504,10 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
       BODY const &body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
 
     for (diff_t i = i_init; i < len; i += i_stride) {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
@@ -1309,13 +1516,16 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1329,11 +1539,15 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
     const diff_t len0 = segment0.end() - segment0.begin();
     const diff_t len1 = segment1.end() - segment1.begin();
 
-    const diff_t i0_init = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1_init = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i0_init = IndexMapper0::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size0);
+    const diff_t i1_init = IndexMapper1::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size1);
 
-    const diff_t i0_stride = IndexMapper0::template size<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1_stride = IndexMapper1::template size<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i0_stride =
+        IndexMapper0::template size<diff_t>() * static_cast<diff_t>(tile_size0);
+    const diff_t i1_stride =
+        IndexMapper1::template size<diff_t>() * static_cast<diff_t>(tile_size1);
 
     for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
       for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
@@ -1344,15 +1558,21 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1369,13 +1589,19 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
     const diff_t len1 = segment1.end() - segment1.begin();
     const diff_t len2 = segment2.end() - segment2.begin();
 
-    const diff_t i0_init = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1_init = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
-    const diff_t i2_init = IndexMapper2::template index<diff_t>() * static_cast<diff_t>(tile_size2);
+    const diff_t i0_init = IndexMapper0::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size0);
+    const diff_t i1_init = IndexMapper1::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size1);
+    const diff_t i2_init = IndexMapper2::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size2);
 
-    const diff_t i0_stride = IndexMapper0::template size<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1_stride = IndexMapper1::template size<diff_t>() * static_cast<diff_t>(tile_size1);
-    const diff_t i2_stride = IndexMapper2::template size<diff_t>() * static_cast<diff_t>(tile_size2);
+    const diff_t i0_stride =
+        IndexMapper0::template size<diff_t>() * static_cast<diff_t>(tile_size0);
+    const diff_t i1_stride =
+        IndexMapper1::template size<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i2_stride =
+        IndexMapper2::template size<diff_t>() * static_cast<diff_t>(tile_size2);
 
     for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
       for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
@@ -1394,12 +1620,14 @@ struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
    CUDA generic tile_tcount implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1416,13 +1644,15 @@ struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1441,19 +1671,25 @@ struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
 
     body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
          segment1.slice(i1, static_cast<diff_t>(tile_size1)),
-         t0, t1);
+         t0,
+         t1);
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1477,17 +1713,21 @@ struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
          segment1.slice(i1, static_cast<diff_t>(tile_size1)),
          segment2.slice(i2, static_cast<diff_t>(tile_size2)),
-         t0, t1, t2);
+         t0,
+         t1,
+         t2);
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1507,13 +1747,15 @@ struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1536,20 +1778,26 @@ struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     if (i0 < len0 && i1 < len1) {
       body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
            segment1.slice(i1, static_cast<diff_t>(tile_size1)),
-           t0, t1);
+           t0,
+           t1);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1578,18 +1826,23 @@ struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
       body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
            segment1.slice(i1, static_cast<diff_t>(tile_size1)),
            segment2.slice(i2, static_cast<diff_t>(tile_size2)),
-           t0, t1, t2);
+           t0,
+           t1,
+           t2);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1611,13 +1864,16 @@ struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1643,25 +1899,34 @@ struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i0_stride = t0_stride * static_cast<diff_t>(tile_size0);
     const diff_t i1_stride = t1_stride * static_cast<diff_t>(tile_size1);
 
-    for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) {
-      for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) {
+    for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0;
+         i0 += i0_stride, t0 += t0_stride) {
+      for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1;
+           i1 += i1_stride, t1 += t1_stride) {
         body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
              segment1.slice(i1, static_cast<diff_t>(tile_size1)),
-             t0, t1);
+             t0,
+             t1);
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1694,13 +1959,18 @@ struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i1_stride = t1_stride * static_cast<diff_t>(tile_size1);
     const diff_t i2_stride = t2_stride * static_cast<diff_t>(tile_size2);
 
-    for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) {
-      for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) {
-        for (diff_t i2 = i2_init, t2 = t2_init; i2 < len2; i2 += i2_stride, t2 += t2_stride) {
+    for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0;
+         i0 += i0_stride, t0 += t0_stride) {
+      for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1;
+           i1 += i1_stride, t1 += t1_stride) {
+        for (diff_t i2 = i2_init, t2 = t2_init; i2 < len2;
+             i2 += i2_stride, t2 += t2_stride) {
           body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
                segment1.slice(i1, static_cast<diff_t>(tile_size1)),
                segment2.slice(i2, static_cast<diff_t>(tile_size2)),
-               t0, t1, t2);
+               t0,
+               t1,
+               t2);
         }
       }
     }
diff --git a/include/RAJA/policy/cuda/multi_reduce.hpp b/include/RAJA/policy/cuda/multi_reduce.hpp
index f9f60f730e..42ba744da2 100644
--- a/include/RAJA/policy/cuda/multi_reduce.hpp
+++ b/include/RAJA/policy/cuda/multi_reduce.hpp
@@ -25,30 +25,28 @@
 
 #if defined(RAJA_ENABLE_CUDA)
 
-#include <type_traits>
+#include <cuda.h>
+
 #include <limits>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
-#include <cuda.h>
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/math.hpp"
-#include "RAJA/util/mutex.hpp"
-#include "RAJA/util/types.hpp"
-#include "RAJA/util/reduce.hpp"
-#include "RAJA/util/OffsetOperators.hpp"
-
 #include "RAJA/pattern/detail/multi_reduce.hpp"
 #include "RAJA/pattern/multi_reduce.hpp"
-
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/policy/cuda/intrinsics.hpp"
+#include "RAJA/util/OffsetOperators.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/math.hpp"
+#include "RAJA/util/mutex.hpp"
+#include "RAJA/util/reduce.hpp"
+#include "RAJA/util/types.hpp"
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-  #include "RAJA/policy/desul/atomic.hpp"
+#include "RAJA/policy/desul/atomic.hpp"
 #else
-  #include "RAJA/policy/cuda/atomic.hpp"
+#include "RAJA/policy/cuda/atomic.hpp"
 #endif
 
 #include "RAJA/policy/cuda/policy.hpp"
@@ -73,32 +71,40 @@ namespace impl
 //
 
 //! combine value into global memory
-template <typename Combiner, typename GetTallyIndex,
-          typename T, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins),
-                                                                      T identity,
-                                                                      int bin,
-                                                                      T value,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+template <typename Combiner,
+          typename GetTallyIndex,
+          typename T,
+          typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(
+    int RAJA_UNUSED_ARG(num_bins),
+    T identity,
+    int bin,
+    T value,
+    T* tally_mem,
+    GetTallyOffset get_tally_offset,
+    int tally_replication,
+    int tally_bins)
 {
-  if (value == identity) { return; }
+  if (value == identity) {
+    return;
+  }
 
-  int tally_index = GetTallyIndex::template index<int>(); // globalWarpId by default
+  int tally_index =
+      GetTallyIndex::template index<int>();  // globalWarpId by default
   int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication);
-  int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+  int tally_offset =
+      get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
   RAJA::reduce::cuda::atomic<Combiner>{}(tally_mem[tally_offset], value);
 }
 
 
 //! initialize shared memory
 template <typename T>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins,
-                                                           T identity,
-                                                           T* shared_mem,
-                                                           int shared_replication)
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(
+    int num_bins,
+    T identity,
+    T* shared_mem,
+    int shared_replication)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -113,60 +119,71 @@ RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins,
 }
 
 //! combine value into shared memory
-template <typename Combiner, typename GetSharedIndex,
-          typename T, typename GetSharedOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins,
-                                                                     T identity,
-                                                                     int bin,
-                                                                     T value,
-                                                                     T* shared_mem,
-                                                                     GetSharedOffset get_shared_offset,
-                                                                     int shared_replication)
+template <typename Combiner,
+          typename GetSharedIndex,
+          typename T,
+          typename GetSharedOffset>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(
+    int num_bins,
+    T identity,
+    int bin,
+    T value,
+    T* shared_mem,
+    GetSharedOffset get_shared_offset,
+    int shared_replication)
 {
-  if (value == identity) { return; }
+  if (value == identity) {
+    return;
+  }
 
-  int shared_index = GetSharedIndex::template index<int>(); // threadId by default
+  int shared_index =
+      GetSharedIndex::template index<int>();  // threadId by default
   int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication);
-  int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+  int shmem_offset =
+      get_shared_offset(bin, num_bins, shared_rep, shared_replication);
 
   RAJA::reduce::cuda::atomic<Combiner>{}(shared_mem[shmem_offset], value);
 }
 
 //! combine value into shared memory
 template <typename Combiner,
-          typename T, typename GetSharedOffset, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins,
-                                                                      T identity,
-                                                                      T* shared_mem,
-                                                                      GetSharedOffset get_shared_offset,
-                                                                      int shared_replication,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+          typename T,
+          typename GetSharedOffset,
+          typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(
+    int num_bins,
+    T identity,
+    T* shared_mem,
+    GetSharedOffset get_shared_offset,
+    int shared_replication,
+    T* tally_mem,
+    GetTallyOffset get_tally_offset,
+    int tally_replication,
+    int tally_bins)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
-                 (gridDim.x * gridDim.y) * blockIdx.z;
+                (gridDim.x * gridDim.y) * blockIdx.z;
 
   __syncthreads();
   for (int bin = threadId; bin < num_bins; bin += numThreads) {
 
     T value = identity;
     for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) {
-      int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+      int shmem_offset =
+          get_shared_offset(bin, num_bins, shared_rep, shared_replication);
       Combiner{}(value, shared_mem[shmem_offset]);
     }
 
     if (value != identity) {
       int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication);
-      int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+      int tally_offset =
+          get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
       RAJA::reduce::cuda::atomic<Combiner>{}(tally_mem[tally_offset], value);
     }
-
   }
 }
 
@@ -182,30 +199,35 @@ RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bi
 
 //! MultiReduction data for Cuda Offload -- stores value, host pointer
 template <typename Combiner, typename T, typename tuning>
-struct MultiReduceGridAtomicHostInit_TallyData
-{
+struct MultiReduceGridAtomicHostInit_TallyData {
   //! setup permanent settings, allocate and initialize tally memory
-  template < typename Container >
-  MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity)
-      : m_tally_mem(nullptr)
-      , m_identity(identity)
-      , m_num_bins(container.size())
-      , m_tally_bins(get_tally_bins(m_num_bins))
-      , m_tally_replication(get_tally_replication())
+  template <typename Container>
+  MultiReduceGridAtomicHostInit_TallyData(Container const& container,
+                                          T const& identity)
+      : m_tally_mem(nullptr),
+        m_identity(identity),
+        m_num_bins(container.size()),
+        m_tally_bins(get_tally_bins(m_num_bins)),
+        m_tally_replication(get_tally_replication())
   {
-    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
+    m_tally_mem = create_tally(
+        container, identity, m_num_bins, m_tally_bins, m_tally_replication);
   }
 
   MultiReduceGridAtomicHostInit_TallyData() = delete;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  MultiReduceGridAtomicHostInit_TallyData& operator=(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData& operator=(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
   ~MultiReduceGridAtomicHostInit_TallyData() = default;
 
 
   //! reset permanent settings, reallocate and reset tally memory
-  template < typename Container >
+  template <typename Container>
   void reset_permanent(Container const& container, T const& identity)
   {
     int new_num_bins = container.size();
@@ -214,19 +236,22 @@ struct MultiReduceGridAtomicHostInit_TallyData
       m_num_bins = new_num_bins;
       m_tally_bins = get_tally_bins(m_num_bins);
       m_tally_replication = get_tally_replication();
-      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
+      m_tally_mem = create_tally(
+          container, identity, m_num_bins, m_tally_bins, m_tally_replication);
     } else {
       {
         int tally_rep = 0;
         int bin = 0;
         for (auto const& value : container) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value;
+          m_tally_mem[GetTallyOffset{}(
+              bin, m_tally_bins, tally_rep, m_tally_replication)] = value;
           ++bin;
         }
       }
       for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) {
         for (int bin = 0; bin < m_num_bins; ++bin) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity;
+          m_tally_mem[GetTallyOffset{}(
+              bin, m_tally_bins, tally_rep, m_tally_replication)] = identity;
         }
       }
     }
@@ -244,9 +269,10 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T get(int bin) const
   {
     ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
-          reducer(m_identity);
+        reducer(m_identity);
     for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) {
-      int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+      int tally_offset =
+          GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
       reducer.combine(m_tally_mem[tally_offset]);
     }
     return reducer.get_and_clear();
@@ -258,20 +284,27 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T identity() const { return m_identity; }
 
 private:
-  static constexpr size_t s_tally_alignment = std::max(size_t(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
-                                                       size_t(RAJA::DATA_ALIGN));
-  static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
+  static constexpr size_t s_tally_alignment = std::max(
+      size_t(
+          policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
+      size_t(RAJA::DATA_ALIGN));
+  static constexpr size_t s_tally_bunch_size =
+      RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
 
   using tally_mempool_type = device_pinned_mempool_type;
   using tally_tuning = typename tuning::GlobalAtomicReplicationTuning;
-  using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer;
+  using TallyAtomicReplicationConcretizer =
+      typename tally_tuning::AtomicReplicationConcretizer;
   using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator;
-  using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch<s_tally_bunch_size>;
+  using GetTallyOffset_rebind =
+      typename GetTallyOffset_rebind_rebunch::template rebunch<
+          s_tally_bunch_size>;
 
 
   static int get_tally_bins(int num_bins)
   {
-    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size;
+    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) *
+           s_tally_bunch_size;
   }
 
   static int get_tally_replication()
@@ -285,35 +318,40 @@ struct MultiReduceGridAtomicHostInit_TallyData
       int func_min_global_replication;
     } func_data{min_tally_replication};
 
-    return TallyAtomicReplicationConcretizer{}.template
-        get_global_replication<int>(func_data);
+    return TallyAtomicReplicationConcretizer{}
+        .template get_global_replication<int>(func_data);
   }
 
-  template < typename Container >
-  static T* create_tally(Container const& container, T const& identity,
-                         int num_bins, int tally_bins, int tally_replication)
+  template <typename Container>
+  static T* create_tally(Container const& container,
+                         T const& identity,
+                         int num_bins,
+                         int tally_bins,
+                         int tally_replication)
   {
     if (num_bins == size_t(0)) {
       return nullptr;
     }
 
     T* tally_mem = tally_mempool_type::getInstance().template malloc<T>(
-        tally_replication*tally_bins, s_tally_alignment);
+        tally_replication * tally_bins, s_tally_alignment);
 
     if (tally_replication > 0) {
       {
         int tally_rep = 0;
         int bin = 0;
         for (auto const& value : container) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(value);
+          int tally_offset =
+              GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(value);
           ++bin;
         }
       }
       for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) {
         for (int bin = 0; bin < num_bins; ++bin) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(identity);
+          int tally_offset =
+              GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(identity);
         }
       }
     }
@@ -321,15 +359,20 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
   static void destroy_tally(T*& tally_mem,
-                            int num_bins, int tally_bins, int tally_replication)
+                            int num_bins,
+                            int tally_bins,
+                            int tally_replication)
   {
     if (num_bins == size_t(0)) {
       return;
     }
 
-    for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) {
+    for (int tally_rep = tally_replication + 1; tally_rep > 0; --tally_rep) {
       for (int bin = num_bins; bin > 0; --bin) {
-        int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication);
+        int tally_offset = GetTallyOffset{}(bin - 1,
+                                            tally_bins,
+                                            tally_rep - 1,
+                                            tally_replication);
         tally_mem[tally_offset].~T();
       }
     }
@@ -345,43 +388,40 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T m_identity;
   int m_num_bins;
   int m_tally_bins;
-  int m_tally_replication; // power of 2, at least the max number of omp threads
+  int m_tally_replication;  // power of 2, at least the max number of omp
+                            // threads
 };
 
 
 //! MultiReduction data for Cuda Offload -- stores value, host pointer
 template <typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_Data
-    : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
-{
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+    : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning> {
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! defer to tally data for some functions
-  using TallyData::TallyData;
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::TallyData;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, do nothing
-  void setup_launch(size_t RAJA_UNUSED_ARG(block_size))
-  { }
+  void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) {}
 
   //! teardown per launch, do nothing
-  void teardown_launch()
-  { }
+  void teardown_launch() {}
 
 
   //! setup on device, do nothing
   RAJA_DEVICE
-  void setup_device()
-  { }
+  void setup_device() {}
 
   //! finalize on device, do nothing
   RAJA_DEVICE
-  void finalize_device()
-  { }
+  void finalize_device() {}
 
 
   //! combine value on device, combine a value into the tally atomically
@@ -389,9 +429,14 @@ struct MultiReduceGridAtomicHostInit_Data
   void combine_device(int bin, T value)
   {
     impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-        m_num_bins, m_identity,
-        bin, value,
-        m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+        m_num_bins,
+        m_identity,
+        bin,
+        value,
+        m_tally_mem,
+        GetTallyOffset{},
+        m_tally_replication,
+        m_tally_bins);
   }
 
   //! combine value on host, combine a value into the tally
@@ -401,7 +446,8 @@ struct MultiReduceGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    int tally_offset =
+        GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
     Combiner{}(m_tally_mem[tally_offset], value);
   }
 
@@ -409,10 +455,10 @@ struct MultiReduceGridAtomicHostInit_Data
   using typename TallyData::GetTallyIndex;
   using typename TallyData::GetTallyOffset;
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 };
 
@@ -420,32 +466,38 @@ struct MultiReduceGridAtomicHostInit_Data
 //! MultiReduction data for Cuda Offload -- stores value, host pointer
 template <typename Combiner, typename T, typename tuning>
 struct MultiReduceBlockThenGridAtomicHostInit_Data
-    : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
-{
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+    : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning> {
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! setup permanent settings, defer to tally data
-  template < typename Container >
-  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity)
-      : TallyData(container, identity)
-      , m_shared_offset(s_shared_offset_unknown)
-      , m_shared_replication(0)
-  { }
+  template <typename Container>
+  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container,
+                                              T const& identity)
+      : TallyData(container, identity),
+        m_shared_offset(s_shared_offset_unknown),
+        m_shared_replication(0)
+  {
+  }
 
   MultiReduceBlockThenGridAtomicHostInit_Data() = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
   ~MultiReduceBlockThenGridAtomicHostInit_Data() = default;
 
 
   //! defer to tally data for some functions
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, setup shared memory parameters
   void setup_launch(size_t block_size)
@@ -456,18 +508,18 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
     }
 
     size_t shared_replication = 0;
-    const size_t shared_offset = allocateDynamicShmem<T>(
-        [&](size_t max_shmem_size) {
-
-      struct {
-        size_t func_threads_per_block;
-        size_t func_max_shared_replication_per_block;
-      } func_data{block_size, max_shmem_size / m_num_bins};
-
-      shared_replication = SharedAtomicReplicationConcretizer{}.template
-          get_shared_replication<size_t>(func_data);
-      return m_num_bins * shared_replication;
-    });
+    const size_t shared_offset =
+        allocateDynamicShmem<T>([&](size_t max_shmem_size) {
+          struct {
+            size_t func_threads_per_block;
+            size_t func_max_shared_replication_per_block;
+          } func_data{block_size, max_shmem_size / m_num_bins};
+
+          shared_replication =
+              SharedAtomicReplicationConcretizer{}
+                  .template get_shared_replication<size_t>(func_data);
+          return m_num_bins * shared_replication;
+        });
 
     if (shared_offset != dynamic_smem_allocation_failure) {
       m_shared_replication = static_cast<int>(shared_replication);
@@ -491,9 +543,10 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   {
     T* shared_mem = get_shared_mem();
     if (shared_mem != nullptr) {
-      impl::block_multi_reduce_init_shmem(
-          m_num_bins, m_identity,
-          shared_mem, m_shared_replication);
+      impl::block_multi_reduce_init_shmem(m_num_bins,
+                                          m_identity,
+                                          shared_mem,
+                                          m_shared_replication);
     }
   }
 
@@ -504,9 +557,15 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
     T* shared_mem = get_shared_mem();
     if (shared_mem != nullptr) {
       impl::grid_multi_reduce_shmem_to_global_atomic<Combiner>(
-          m_num_bins, m_identity,
-          shared_mem, GetSharedOffset{}, m_shared_replication,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins,
+          m_identity,
+          shared_mem,
+          GetSharedOffset{},
+          m_shared_replication,
+          m_tally_mem,
+          GetTallyOffset{},
+          m_tally_replication,
+          m_tally_bins);
     }
   }
 
@@ -518,14 +577,23 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
     T* shared_mem = get_shared_mem();
     if (shared_mem != nullptr) {
       impl::block_multi_reduce_combine_shmem_atomic<Combiner, GetSharedIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          shared_mem, GetSharedOffset{}, m_shared_replication);
+          m_num_bins,
+          m_identity,
+          bin,
+          value,
+          shared_mem,
+          GetSharedOffset{},
+          m_shared_replication);
     } else {
       impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins,
+          m_identity,
+          bin,
+          value,
+          m_tally_mem,
+          GetTallyOffset{},
+          m_tally_replication,
+          m_tally_bins);
     }
   }
 
@@ -536,13 +604,15 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    int tally_offset =
+        GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
     Combiner{}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using shared_tuning = typename tuning::SharedAtomicReplicationTuning;
-  using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer;
+  using SharedAtomicReplicationConcretizer =
+      typename shared_tuning::AtomicReplicationConcretizer;
   using GetSharedIndex = typename shared_tuning::ReplicationIndexer;
   using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator;
   using GetSharedOffset = typename GetSharedOffset_rebind::template rebind<int>;
@@ -551,18 +621,20 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   using typename TallyData::GetTallyOffset;
 
 
-  static constexpr int s_shared_offset_unknown = std::numeric_limits<int>::max();
-  static constexpr int s_shared_offset_invalid = std::numeric_limits<int>::max() - 1;
+  static constexpr int s_shared_offset_unknown =
+      std::numeric_limits<int>::max();
+  static constexpr int s_shared_offset_invalid =
+      std::numeric_limits<int>::max() - 1;
 
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 
-  int m_shared_offset; // in bytes
-  int m_shared_replication; // power of 2
+  int m_shared_offset;       // in bytes
+  int m_shared_replication;  // power of 2
 
 
   RAJA_DEVICE
@@ -595,19 +667,28 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
-struct MultiReduceDataCuda
-{
-  static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available<T>::value;
+template <typename T, typename t_MultiReduceOp, typename tuning>
+struct MultiReduceDataCuda {
+  static constexpr bool atomic_available =
+      RAJA::reduce::cuda::cuda_atomic_available<T>::value;
 
   //! cuda reduction data storage class and folding algorithm
-  using reduce_data_type =
-      std::conditional_t<(atomic_available),
-        std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic),
-          cuda::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-          std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic),
-            cuda::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-            void>>,
+  using reduce_data_type = std::conditional_t<
+      (atomic_available),
+      std::conditional_t<
+          (tuning::algorithm ==
+           multi_reduce_algorithm::
+               init_host_combine_block_atomic_then_grid_atomic),
+          cuda::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                            T,
+                                                            tuning>,
+          std::conditional_t<
+              (tuning::algorithm ==
+               multi_reduce_algorithm::init_host_combine_global_atomic),
+              cuda::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                       T,
+                                                       tuning>,
+              void>>,
       void>;
 
 
@@ -619,13 +700,15 @@ struct MultiReduceDataCuda
 
   MultiReduceDataCuda() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataCuda>::value>* = nullptr >
+  template <
+      typename Container,
+      std::enable_if_t<!std::is_same<Container, MultiReduceDataCuda>::value>* =
+          nullptr>
   MultiReduceDataCuda(Container const& container, T identity)
-      : m_parent(this)
-      , m_sync_list(new SyncList)
-      , m_data(container, identity)
-      , m_own_launch_data(false)
+      : m_parent(this),
+        m_sync_list(new SyncList),
+        m_data(container, identity),
+        m_own_launch_data(false)
   {
   }
 
@@ -639,9 +722,10 @@ struct MultiReduceDataCuda
 #else
       : m_parent(&other)
 #endif
-      , m_sync_list(other.m_sync_list)
-      , m_data(other.m_data)
-      , m_own_launch_data(false)
+        ,
+        m_sync_list(other.m_sync_list),
+        m_data(other.m_data),
+        m_own_launch_data(false)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
     if (m_parent) {
@@ -661,9 +745,9 @@ struct MultiReduceDataCuda
 #endif
   }
 
-  MultiReduceDataCuda(MultiReduceDataCuda &&) = delete;
+  MultiReduceDataCuda(MultiReduceDataCuda&&) = delete;
   MultiReduceDataCuda& operator=(MultiReduceDataCuda const&) = delete;
-  MultiReduceDataCuda& operator=(MultiReduceDataCuda &&) = delete;
+  MultiReduceDataCuda& operator=(MultiReduceDataCuda&&) = delete;
 
   //! cleanup resources owned by this copy
   //  on device store in pinned buffer on host
@@ -695,7 +779,7 @@ struct MultiReduceDataCuda
   }
 
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
     synchronize_resources_and_clear_list();
@@ -729,7 +813,7 @@ struct MultiReduceDataCuda
 
 
 private:
-  MultiReduceDataCuda const *m_parent;
+  MultiReduceDataCuda const* m_parent;
   SyncList* m_sync_list;
   reduce_data_type m_data;
   bool m_own_launch_data;
@@ -755,7 +839,8 @@ struct MultiReduceDataCuda
 
 }  // end namespace cuda
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::cuda::cuda_multi_reduce_policy, cuda::MultiReduceDataCuda)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::cuda::cuda_multi_reduce_policy,
+                                cuda::MultiReduceDataCuda)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/params/kernel_name.hpp b/include/RAJA/policy/cuda/params/kernel_name.hpp
index 4edf645ed3..c521db578e 100644
--- a/include/RAJA/policy/cuda/params/kernel_name.hpp
+++ b/include/RAJA/policy/cuda/params/kernel_name.hpp
@@ -4,45 +4,53 @@
 #if defined(RAJA_CUDA_ACTIVE)
 
 #include <cuda.h>
-#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
-#include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+#include "RAJA/pattern/params/kernel_name.hpp"
+#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  init(KernelName& kn, const RAJA::cuda::detail::cudaInfo &)
-  {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL> > init(
+    KernelName &kn,
+    const RAJA::cuda::detail::cudaInfo &)
+{
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-    nvtxRangePush(kn.name);
+  nvtxRangePush(kn.name);
 #else
-    RAJA_UNUSED_VAR(kn);
+  RAJA_UNUSED_VAR(kn);
 #endif
-  }
-
-  // Combine
-  template<typename EXEC_POL>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  combine(KernelName&) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  resolve(KernelName&, const RAJA::cuda::detail::cudaInfo &)
-  {
+}
+
+// Combine
+template <typename EXEC_POL>
+RAJA_HOST_DEVICE camp::concepts::enable_if<
+    type_traits::is_cuda_policy<EXEC_POL> >
+combine(KernelName &)
+{
+}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL> > resolve(
+    KernelName &,
+    const RAJA::cuda::detail::cudaInfo &)
+{
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-    nvtxRangePop();
+  nvtxRangePop();
 #endif
-  }
+}
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_CUDA_REDUCE_HPP
+#endif  //  NEW_REDUCE_CUDA_REDUCE_HPP
diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp
index 6ab3372aaa..d87b7bd8f8 100644
--- a/include/RAJA/policy/cuda/params/reduce.hpp
+++ b/include/RAJA/policy/cuda/params/reduce.hpp
@@ -4,60 +4,65 @@
 #if defined(RAJA_CUDA_ACTIVE)
 
 #include <cuda.h>
+
+#include "RAJA/pattern/params/reducer.hpp"
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
+#include "RAJA/policy/cuda/policy.hpp"
 #include "RAJA/policy/cuda/reduce.hpp"
-#include "RAJA/pattern/params/reducer.hpp"
 
-#include "RAJA/policy/cuda/policy.hpp"
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL> > init(
+    Reducer<OP, T, VOp>& red,
+    RAJA::cuda::detail::cudaInfo& ci)
+{
+  red.devicetarget =
+      RAJA::cuda::pinned_mempool_type::getInstance().template malloc<T>(1);
+  red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z);
+  red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance()
+                         .template malloc<unsigned int>(1);
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+RAJA_HOST_DEVICE camp::concepts::enable_if<
+    type_traits::is_cuda_policy<EXEC_POL> >
+combine(Reducer<OP, T, VOp>& red)
+{
+  RAJA::cuda::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter, OP>(
+      red.devicetarget, red.getVal(), red.device_mem, red.device_count);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL> > resolve(
+    Reducer<OP, T, VOp>& red,
+    RAJA::cuda::detail::cudaInfo& ci)
+{
+  // complete reduction
+  ci.res.wait();
+
+  red.combineTarget(*red.devicetarget);
+
+  // free memory
+  RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count);
+  red.device_count = nullptr;
+  red.device_mem.deallocate();
+  RAJA::cuda::pinned_mempool_type::getInstance().free(red.devicetarget);
+  red.devicetarget = nullptr;
+}
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red, RAJA::cuda::detail::cudaInfo& ci)
-  {
-    red.devicetarget = RAJA::cuda::pinned_mempool_type::getInstance().template malloc<T>(1);
-    red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z);
-    red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc<unsigned int>(1);
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& red)
-  {
-    RAJA::cuda::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter, OP>(red.devicetarget,
-                                                                            red.getVal(),
-                                                                            red.device_mem,
-                                                                            red.device_count);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red, RAJA::cuda::detail::cudaInfo& ci)
-  {
-    // complete reduction
-    ci.res.wait();
-
-    red.combineTarget(*red.devicetarget);
-
-    // free memory
-    RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count);
-    red.device_count = nullptr;
-    red.device_mem.deallocate();
-    RAJA::cuda::pinned_mempool_type::getInstance().free(red.devicetarget);
-    red.devicetarget = nullptr;
-  }
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_CUDA_REDUCE_HPP
+#endif  //  NEW_REDUCE_CUDA_REDUCE_HPP
diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 4d7a7a2576..f6038b0c64 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -26,14 +26,12 @@
 #include <utility>
 
 #include "RAJA/pattern/reduce.hpp"
-
 #include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/policy/sequential/policy.hpp"
-
-#include "RAJA/util/Operators.hpp"
 #include "RAJA/util/OffsetOperators.hpp"
-#include "RAJA/util/types.hpp"
+#include "RAJA/util/Operators.hpp"
 #include "RAJA/util/math.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -75,16 +73,16 @@ namespace cuda
 {
 
 /// Type representing thread and block indexing within a grid
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct IndexGlobal;
 
-template<typename ...indexers>
+template <typename... indexers>
 struct IndexFlatten;
 
-template<size_t divisor, typename index>
+template <size_t divisor, typename index>
 struct IndexDivide;
 
-template<size_t divisor, typename index>
+template <size_t divisor, typename index>
 struct IndexModulo;
 
 
@@ -94,15 +92,15 @@ struct IndexModulo;
  * Note that the maximum occupancy of the kernel may be less than the maximum
  * occupancy of the device in terms of total threads.
  */
-struct MaxOccupancyConcretizer
-{
-  template < typename IdxT, typename Data >
+struct MaxOccupancyConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     IdxT device_sm_per_device = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -115,10 +113,9 @@ struct MaxOccupancyConcretizer
  * maximum grid size:
  * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm
  */
-template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
-struct FractionOffsetOccupancyConcretizer
-{
-  template < typename IdxT, typename Data >
+template <typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
+struct FractionOffsetOccupancyConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     using Fraction = typename t_Fraction::template rebind<IdxT>;
@@ -130,11 +127,14 @@ struct FractionOffsetOccupancyConcretizer
       func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm);
     }
 
-    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) {
-      func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
+    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) >
+        IdxT(0)) {
+      func_max_blocks_per_sm =
+          IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
     }
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -148,22 +148,23 @@ struct FractionOffsetOccupancyConcretizer
  * Otherwise use the given AvoidMaxOccupancyCalculator to determine the
  * maximum grid size.
  */
-template < typename AvoidMaxOccupancyConcretizer >
-struct AvoidDeviceMaxThreadOccupancyConcretizer
-{
-  template < typename IdxT, typename Data >
+template <typename AvoidMaxOccupancyConcretizer>
+struct AvoidDeviceMaxThreadOccupancyConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     IdxT device_max_threads_per_sm = data.device_max_threads_per_sm;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
     IdxT func_threads_per_block = data.func_threads_per_block;
 
-    IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm;
+    IdxT func_max_threads_per_sm =
+        func_threads_per_block * func_max_blocks_per_sm;
 
     if (func_max_threads_per_sm < device_max_threads_per_sm) {
       return MaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
     } else {
-      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
+      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(
+          data);
     }
   }
 };
@@ -172,10 +173,9 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer
 /*!
  * Get an amount of replication that is preferred_replication.
  */
-template < size_t preferred_replication >
-struct ConstantPreferredReplicationConcretizer
-{
-  template < typename IdxT, typename Data >
+template <size_t preferred_replication>
+struct ConstantPreferredReplicationConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data))
   {
     return IdxT(preferred_replication);
@@ -187,11 +187,11 @@ struct ConstantPreferredReplicationConcretizer
  * data.func_threads_per_block is less than t_cutoff or
  * preferred_replication_after_cutoff otherwise.
  */
-template < size_t t_cutoff, size_t preferred_replication_before_cutoff,
-                            size_t preferred_replication_after_cutoff >
-struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
-{
-  template < typename IdxT, typename Data >
+template <size_t t_cutoff,
+          size_t preferred_replication_before_cutoff,
+          size_t preferred_replication_after_cutoff>
+struct ThreadsPerBlockCutoffPreferredReplicationConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& data)
   {
     IdxT cutoff = t_cutoff;
@@ -210,19 +210,20 @@ struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
  * most the amount given by data.func_max_shared_replication_per_block or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
-struct SharedAtomicReplicationMaxPow2Concretizer
-{
-  template < typename IdxT, typename Data >
+template <typename GetPreferredReplication>
+struct SharedAtomicReplicationMaxPow2Concretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_shared_replication(Data const& data)
   {
-    IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block;
+    IdxT func_max_shared_replication_per_block =
+        data.func_max_shared_replication_per_block;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication{}.template get_preferred_replication<IdxT>(
+            data);
 
-    return prev_pow2(std::min(preferred_replication,
-                              func_max_shared_replication_per_block));
+    return prev_pow2(
+        std::min(preferred_replication, func_max_shared_replication_per_block));
   }
 };
 
@@ -231,39 +232,36 @@ struct SharedAtomicReplicationMaxPow2Concretizer
  * least the amount given by data.func_min_global_replication or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
-struct GlobalAtomicReplicationMinPow2Concretizer
-{
-  template < typename IdxT, typename Data >
+template <typename GetPreferredReplication>
+struct GlobalAtomicReplicationMinPow2Concretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_global_replication(Data const& data)
   {
     IdxT func_min_global_replication = data.func_min_global_replication;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication{}.template get_preferred_replication<IdxT>(
+            data);
 
-    return next_pow2(std::max(preferred_replication, func_min_global_replication));
+    return next_pow2(
+        std::max(preferred_replication, func_min_global_replication));
   }
 };
 
 
-enum struct reduce_algorithm : int
-{
+enum struct reduce_algorithm : int {
   combine_last_block,
   init_device_combine_atomic_block,
   init_host_combine_atomic_block
 };
 
-enum struct block_communication_mode : int
-{
-  device_fence,
-  block_fence
-};
+enum struct block_communication_mode : int { device_fence, block_fence };
 
-template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode,
-           size_t t_replication, size_t t_atomic_stride >
-struct ReduceTuning
-{
+template <reduce_algorithm t_algorithm,
+          block_communication_mode t_comm_mode,
+          size_t t_replication,
+          size_t t_atomic_stride>
+struct ReduceTuning {
   static constexpr reduce_algorithm algorithm = t_algorithm;
   static constexpr block_communication_mode comm_mode = t_comm_mode;
   static constexpr size_t replication = t_replication;
@@ -273,27 +271,24 @@ struct ReduceTuning
 };
 
 
-enum struct multi_reduce_algorithm : int
-{
+enum struct multi_reduce_algorithm : int {
   init_host_combine_block_atomic_then_grid_atomic,
   init_host_combine_global_atomic
 };
 
-template < typename t_AtomicReplicationConcretizer,
-           typename t_ReplicationIndexer,
-           typename t_OffsetCalculator >
-struct AtomicReplicationTuning
-{
+template <typename t_AtomicReplicationConcretizer,
+          typename t_ReplicationIndexer,
+          typename t_OffsetCalculator>
+struct AtomicReplicationTuning {
   using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer;
   using ReplicationIndexer = t_ReplicationIndexer;
   using OffsetCalculator = t_OffsetCalculator;
 };
 
-template < multi_reduce_algorithm t_algorithm,
-           typename t_SharedAtomicReplicationTuning,
-           typename t_GlobalAtomicReplicationTuning >
-struct MultiReduceTuning
-{
+template <multi_reduce_algorithm t_algorithm,
+          typename t_SharedAtomicReplicationTuning,
+          typename t_GlobalAtomicReplicationTuning>
+struct MultiReduceTuning {
   static constexpr multi_reduce_algorithm algorithm = t_algorithm;
   using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning;
   using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning;
@@ -307,30 +302,34 @@ namespace policy
 namespace cuda
 {
 
-struct DeviceConstants
-{
+struct DeviceConstants {
   RAJA::Index_type WARP_SIZE;
   RAJA::Index_type MAX_BLOCK_SIZE;
   RAJA::Index_type MAX_WARPS;
-  RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics
+  RAJA::Index_type
+      ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE;  // basically the cache line size of
+                                             // the cache level that handles
+                                             // atomics
 
   constexpr DeviceConstants(RAJA::Index_type warp_size,
                             RAJA::Index_type max_block_size,
                             RAJA::Index_type atomic_cache_line_bytes) noexcept
-    : WARP_SIZE(warp_size)
-    , MAX_BLOCK_SIZE(max_block_size)
-    , MAX_WARPS(max_block_size / warp_size)
-    , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
-  { }
+      : WARP_SIZE(warp_size),
+        MAX_BLOCK_SIZE(max_block_size),
+        MAX_WARPS(max_block_size / warp_size),
+        ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
+  {
+  }
 };
 
 //
 // Operations in the included files are parametrized using the following
 // values for CUDA warp size and max block size.
 //
-constexpr DeviceConstants device_constants(32, 1024, 32); // V100
+constexpr DeviceConstants device_constants(32, 1024, 32);  // V100
 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
-              "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS");
+              "RAJA Assumption Broken: device_constants.WARP_SIZE < "
+              "device_constants.MAX_WARPS");
 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
               "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
               "a multiple of device_constants.WARP_SIZE");
@@ -339,37 +338,48 @@ constexpr const size_t MIN_BLOCKS_PER_SM = 1;
 constexpr const size_t MAX_BLOCKS_PER_SM = 32;
 
 
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
-struct cuda_indexer {};
+template <typename _IterationMapping,
+          kernel_sync_requirement sync,
+          typename... _IterationGetters>
+struct cuda_indexer {
+};
 
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
-struct cuda_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t<
-  RAJA::Policy::cuda,
-  RAJA::Pattern::region,
-  detail::get_launch<true /*async */>::value,
-  RAJA::Platform::cuda> {
+template <typename _IterationMapping,
+          kernel_sync_requirement sync,
+          typename... _IterationGetters>
+struct cuda_flatten_indexer
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::cuda,
+          RAJA::Pattern::region,
+          detail::get_launch<true /*async */>::value,
+          RAJA::Platform::cuda> {
   using IterationGetter = RAJA::cuda::IndexFlatten<_IterationGetters...>;
 };
 
-template <typename _IterationMapping, typename _IterationGetter, typename _LaunchConcretizer,
-          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, bool Async = false>
+template <typename _IterationMapping,
+          typename _IterationGetter,
+          typename _LaunchConcretizer,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
+          bool Async = false>
 struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::cuda,
-                       RAJA::Pattern::forall,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::cuda> {
+                                RAJA::Policy::cuda,
+                                RAJA::Pattern::forall,
+                                detail::get_launch<Async>::value,
+                                RAJA::Platform::cuda> {
   using IterationMapping = _IterationMapping;
   using IterationGetter = _IterationGetter;
   using LaunchConcretizer = _LaunchConcretizer;
 };
 
-template <bool Async, int num_threads = named_usage::unspecified,
+template <bool Async,
+          int num_threads = named_usage::unspecified,
           size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
-struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t<
-                                RAJA::Policy::cuda,
-                                RAJA::Pattern::region,
-                                detail::get_launch<Async>::value,
-                                RAJA::Platform::cuda> {
+struct cuda_launch_explicit_t
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::cuda,
+          RAJA::Pattern::region,
+          detail::get_launch<Async>::value,
+          RAJA::Platform::cuda> {
 };
 
 
@@ -380,12 +390,14 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform
 ///
 /// WorkGroup execution policies
 ///
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, bool Async = false>
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
+          bool Async = false>
 struct cuda_work_explicit : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::cuda,
-                       RAJA::Pattern::workgroup_exec,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::cuda> {
+                                RAJA::Policy::cuda,
+                                RAJA::Pattern::workgroup_exec,
+                                detail::get_launch<Async>::value,
+                                RAJA::Platform::cuda> {
 };
 
 /// execute the enqueued loops in an unordered fashion by mapping loops to
@@ -394,9 +406,9 @@ struct cuda_work_explicit : public RAJA::make_policy_pattern_launch_platform_t<
 /// of all the loops
 struct unordered_cuda_loop_y_block_iter_x_threadblock_average
     : public RAJA::make_policy_pattern_platform_t<
-                       RAJA::Policy::cuda,
-                       RAJA::Pattern::workgroup_order,
-                       RAJA::Platform::cuda> {
+          RAJA::Policy::cuda,
+          RAJA::Pattern::workgroup_order,
+          RAJA::Platform::cuda> {
 };
 
 
@@ -408,36 +420,36 @@ struct unordered_cuda_loop_y_block_iter_x_threadblock_average
 ///////////////////////////////////////////////////////////////////////
 ///
 
-template < typename tuning >
-struct cuda_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::cuda,
-                                                RAJA::Pattern::reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::cuda,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
+template <typename tuning>
+struct cuda_reduce_policy : public RAJA::make_policy_pattern_launch_platform_t<
+                                RAJA::Policy::cuda,
+                                RAJA::Pattern::reduce,
+                                detail::get_launch<false>::value,
+                                RAJA::Platform::cuda,
+                                std::conditional_t<tuning::consistent,
+                                                   reduce::ordered,
+                                                   reduce::unordered>> {
 };
 
-template < typename tuning >
+template <typename tuning>
 struct cuda_multi_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::cuda,
-                                                RAJA::Pattern::multi_reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::cuda,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::cuda,
+          RAJA::Pattern::multi_reduce,
+          detail::get_launch<false>::value,
+          RAJA::Platform::cuda,
+          std::conditional_t<tuning::consistent,
+                             reduce::ordered,
+                             reduce::unordered>> {
 };
 
 /*!
  * Cuda atomic policy for using cuda atomics on the device and
  * the provided policy on the host
  */
-template<typename host_policy>
-struct cuda_atomic_explicit{};
+template <typename host_policy>
+struct cuda_atomic_explicit {
+};
 
 /*!
  * Default cuda atomic policy uses cuda atomics on the device and non-atomics
@@ -448,23 +460,26 @@ using cuda_atomic = cuda_atomic_explicit<seq_atomic>;
 
 // Policy for RAJA::statement::Reduce that reduces threads in a block
 // down to threadIdx 0
-struct cuda_block_reduce{};
+struct cuda_block_reduce {
+};
 
 // Policy for RAJA::statement::Reduce that reduces threads in a warp
 // down to the first lane of the warp
-struct cuda_warp_reduce{};
+struct cuda_warp_reduce {
+};
 
 // Policy to map work directly to threads within a warp
 // Maximum iteration count is WARP_SIZE
 // Cannot be used in conjunction with cuda_thread_x_*
 // Multiple warps have to be created by using cuda_thread_{yz}_*
-struct cuda_warp_direct{};
+struct cuda_warp_direct {
+};
 
 // Policy to map work to threads within a warp using a warp-stride loop
 // Cannot be used in conjunction with cuda_thread_x_*
 // Multiple warps have to be created by using cuda_thread_{yz}_*
-struct cuda_warp_loop{};
-
+struct cuda_warp_loop {
+};
 
 
 // Policy to map work to threads within a warp using a bit mask
@@ -473,8 +488,9 @@ struct cuda_warp_loop{};
 // Since we are masking specific threads, multiple nested
 // cuda_warp_masked
 // can be used to create complex thread interleaving patterns
-template<typename Mask>
-struct cuda_warp_masked_direct {};
+template <typename Mask>
+struct cuda_warp_masked_direct {
+};
 
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with cuda_thread_x_*
@@ -482,15 +498,18 @@ struct cuda_warp_masked_direct {};
 // Since we are masking specific threads, multiple nested
 // cuda_warp_masked
 // can be used to create complex thread interleaving patterns
-template<typename Mask>
-struct cuda_warp_masked_loop {};
+template <typename Mask>
+struct cuda_warp_masked_loop {
+};
 
 
-template<typename Mask>
-struct cuda_thread_masked_direct {};
+template <typename Mask>
+struct cuda_thread_masked_direct {
+};
 
-template<typename Mask>
-struct cuda_thread_masked_loop {};
+template <typename Mask>
+struct cuda_thread_masked_loop {
+};
 
 
 struct cuda_synchronize : make_policy_pattern_launch_t<Policy::cuda,
@@ -508,18 +527,16 @@ namespace internal
 RAJA_INLINE
 int get_size(cuda_dim_t dims)
 {
-  if(dims.x == 0 && dims.y == 0 && dims.z == 0){
+  if (dims.x == 0 && dims.y == 0 && dims.z == 0) {
     return 0;
   }
-  return (dims.x ? dims.x : 1) *
-         (dims.y ? dims.y : 1) *
-         (dims.z ? dims.z : 1);
+  return (dims.x ? dims.x : 1) * (dims.y ? dims.y : 1) * (dims.z ? dims.z : 1);
 }
 
 struct CudaDims {
 
-  cuda_dim_t blocks{0,0,0};
-  cuda_dim_t threads{0,0,0};
+  cuda_dim_t blocks{0, 0, 0};
+  cuda_dim_t threads{0, 0, 0};
 
   CudaDims() = default;
   CudaDims(CudaDims const&) = default;
@@ -527,22 +544,20 @@ struct CudaDims {
 
   RAJA_INLINE
   CudaDims(cuda_dim_member_t default_val)
-    : blocks{default_val, default_val, default_val}
-    , threads{default_val, default_val, default_val}
-  { }
+      : blocks{default_val, default_val, default_val},
+        threads{default_val, default_val, default_val}
+  {
+  }
 
   RAJA_INLINE
-  int num_blocks() const {
-    return get_size(blocks);
-  }
+  int num_blocks() const { return get_size(blocks); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return get_size(threads);
-  }
+  int num_threads() const { return get_size(threads); }
 
   RAJA_INLINE
-  cuda_dim_t get_blocks() const {
+  cuda_dim_t get_blocks() const
+  {
     if (num_blocks() != 0) {
       return {(blocks.x ? blocks.x : 1),
               (blocks.y ? blocks.y : 1),
@@ -553,7 +568,8 @@ struct CudaDims {
   }
 
   RAJA_INLINE
-  cuda_dim_t get_threads() const {
+  cuda_dim_t get_threads() const
+  {
     if (num_threads() != 0) {
       return {(threads.x ? threads.x : 1),
               (threads.y ? threads.y : 1),
@@ -564,101 +580,85 @@ struct CudaDims {
   }
 };
 
-template<named_dim dim>
+template <named_dim dim>
 struct CudaDimHelper;
 
-template<>
-struct CudaDimHelper<named_dim::x>{
+template <>
+struct CudaDimHelper<named_dim::x> {
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  cuda_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
   {
     return d.x;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, cuda_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
   {
     d.x = value;
   }
 };
 
-template<>
-struct CudaDimHelper<named_dim::y>{
+template <>
+struct CudaDimHelper<named_dim::y> {
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  cuda_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
   {
     return d.y;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, cuda_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
   {
     d.y = value;
   }
 };
 
-template<>
-struct CudaDimHelper<named_dim::z>{
+template <>
+struct CudaDimHelper<named_dim::z> {
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  cuda_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
   {
     return d.z;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, cuda_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
   {
     d.z = value;
   }
 };
 
-template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-constexpr
-cuda_dim_member_t get_cuda_dim(dim_t const &d)
+template <named_dim dim, typename dim_t>
+RAJA_HOST_DEVICE constexpr cuda_dim_member_t get_cuda_dim(dim_t const& d)
 {
   return CudaDimHelper<dim>::get(d);
 }
 
-template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-void set_cuda_dim(dim_t &d, cuda_dim_member_t value)
+template <named_dim dim, typename dim_t>
+RAJA_HOST_DEVICE void set_cuda_dim(dim_t& d, cuda_dim_member_t value)
 {
   return CudaDimHelper<dim>::set(d, value);
 }
 
-} // namespace internal
+}  // namespace internal
 
 namespace cuda
 {
 
 /// specify block size and grid size for one dimension at runtime
-struct IndexSize
-{
+struct IndexSize {
   cuda_dim_member_t block_size = named_usage::unspecified;
   cuda_dim_member_t grid_size = named_usage::unspecified;
 
-  RAJA_HOST_DEVICE constexpr
-  IndexSize(cuda_dim_member_t _block_size = named_usage::unspecified,
-            cuda_dim_member_t _grid_size = named_usage::unspecified)
-    : block_size(_block_size)
-    , grid_size(_grid_size)
-  { }
+  RAJA_HOST_DEVICE constexpr IndexSize(
+      cuda_dim_member_t _block_size = named_usage::unspecified,
+      cuda_dim_member_t _grid_size = named_usage::unspecified)
+      : block_size(_block_size), grid_size(_grid_size)
+  {
+  }
 };
 
 /// Type representing thread indexing within a grid
@@ -666,436 +666,440 @@ struct IndexSize
 
 /// useful for global indexing
 /// with fixed block size and fixed grid size
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
-struct IndexGlobal
-{
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+struct IndexGlobal {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
   static constexpr int grid_size = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(block_size) * static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed block size of 1 and fixed grid size
-template<named_dim dim, int GRID_SIZE>
-struct IndexGlobal<dim, 1, GRID_SIZE>
-{
+template <named_dim dim, int GRID_SIZE>
+struct IndexGlobal<dim, 1, GRID_SIZE> {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = 1;
   static constexpr int grid_size = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed block size and fixed grid size of 1
-template<named_dim dim, int BLOCK_SIZE>
-struct IndexGlobal<dim, BLOCK_SIZE, 1>
-{
+template <named_dim dim, int BLOCK_SIZE>
+struct IndexGlobal<dim, BLOCK_SIZE, 1> {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
   static constexpr int grid_size = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
 /// with fixed block size and fixed grid size of 1
-template<named_dim dim>
-struct IndexGlobal<dim, 1, 1>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, 1, 1> {
   static constexpr int block_size = 1;
   static constexpr int grid_size = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
 /// with dynamic block size and fixed grid size
-template<named_dim dim, int GRID_SIZE>
-struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>
-{
+template <named_dim dim, int GRID_SIZE>
+struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE> {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::unspecified;
   static constexpr int grid_size = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(grid_size);
   }
 };
 /// with dynamic block size and fixed grid size of 1
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::unspecified, 1>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::unspecified, 1> {
   static constexpr int block_size = named_usage::unspecified;
   static constexpr int grid_size = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockDim));
   }
 };
 
 /// with fixed block size and dynamic grid size
-template<named_dim dim, int BLOCK_SIZE>
-struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>
-{
+template <named_dim dim, int BLOCK_SIZE>
+struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified> {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
   static constexpr int grid_size = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 /// with fixed block size of 1 and dynamic grid size
-template<named_dim dim>
-struct IndexGlobal<dim, 1, named_usage::unspecified>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, 1, named_usage::unspecified> {
   static constexpr int block_size = 1;
   static constexpr int grid_size = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 
 /// with dynamic block size and dynamic grid size
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified> {
   static constexpr int block_size = named_usage::unspecified;
   static constexpr int grid_size = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 
 /// useful for indexing blocks (ignores thread indices)
 /// with fixed grid size
-template<named_dim dim, int GRID_SIZE>
-struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE>
-{
+template <named_dim dim, int GRID_SIZE>
+struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE> {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::ignored;
   static constexpr int grid_size = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed grid sized of 1
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::ignored, 1>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::ignored, 1> {
   static constexpr int block_size = named_usage::ignored;
   static constexpr int grid_size = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 /// with dynamic grid size
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified> {
   static constexpr int block_size = named_usage::ignored;
   static constexpr int grid_size = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 
 /// useful for indexing threads (ignores block indices)
 /// with fixed block size
-template<named_dim dim, int BLOCK_SIZE>
-struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>
-{
+template <named_dim dim, int BLOCK_SIZE>
+struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored> {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
   static constexpr int grid_size = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
 /// with fixed block size of 1
-template<named_dim dim>
-struct IndexGlobal<dim, 1, named_usage::ignored>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, 1, named_usage::ignored> {
   static constexpr int block_size = 1;
   static constexpr int grid_size = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 /// with dynamic block size
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored> {
   static constexpr int block_size = named_usage::unspecified;
   static constexpr int grid_size = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockDim));
   }
 };
 
 /// useful for doing single threaded sequential tasks
 /// (ignores thread and block indices)
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored> {
   static constexpr int block_size = named_usage::ignored;
   static constexpr int grid_size = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
 // useful for flatten global index (includes x)
-template<typename x_index>
-struct IndexFlatten<x_index>
-{
+template <typename x_index>
+struct IndexFlatten<x_index> {
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>();
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>();
+    return x_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y)
-template<typename x_index, typename y_index>
-struct IndexFlatten<x_index, y_index>
-{
+template <typename x_index, typename y_index>
+struct IndexFlatten<x_index, y_index> {
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>());
-
+           x_index::template size<IdxT>() * (y_index::template index<IdxT>());
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y,z)
-template<typename x_index, typename y_index, typename z_index>
-struct IndexFlatten<x_index, y_index, z_index>
-{
+template <typename x_index, typename y_index, typename z_index>
+struct IndexFlatten<x_index, y_index, z_index> {
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>() +
-                                         y_index::template size<IdxT>() * z_index::template index<IdxT>());
+           x_index::template size<IdxT>() *
+               (y_index::template index<IdxT>() +
+                y_index::template size<IdxT>() *
+                    z_index::template index<IdxT>());
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> () * z_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>() *
+           z_index::template size<IdxT>();
   }
-
 };
 
-template<size_t divisor, typename indexer>
-struct IndexDivide
-{
-  template < typename IdxT = cuda_dim_member_t >
+template <size_t divisor, typename indexer>
+struct IndexDivide {
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() / static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(), static_cast<IdxT>(divisor));
+    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(),
+                                   static_cast<IdxT>(divisor));
   }
 };
 
-template<size_t divisor, typename indexer>
-struct IndexModulo
-{
-  template < typename IdxT = cuda_dim_member_t >
+template <size_t divisor, typename indexer>
+struct IndexModulo {
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() % static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(divisor);
@@ -1104,121 +1108,115 @@ struct IndexModulo
 
 
 // helper to get just the thread indexing part of IndexGlobal
-template < typename index_global >
+template <typename index_global>
 struct get_index_thread;
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
-struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
-{
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>> {
   using type = IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 };
 ///
 template <typename x_index, typename y_index, typename z_index>
-struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
-{
+struct get_index_thread<IndexFlatten<x_index, y_index, z_index>> {
   using type = IndexFlatten<typename get_index_thread<x_index>::type,
                             typename get_index_thread<y_index>::type,
                             typename get_index_thread<z_index>::type>;
 };
 
 // helper to get just the block indexing part of IndexGlobal
-template < typename index_global >
+template <typename index_global>
 struct get_index_block;
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
-struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
-{
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>> {
   using type = IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 };
 ///
 template <typename x_index, typename y_index, typename z_index>
-struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
-{
+struct get_index_block<IndexFlatten<x_index, y_index, z_index>> {
   using type = IndexFlatten<typename get_index_block<x_index>::type,
                             typename get_index_block<y_index>::type,
                             typename get_index_block<z_index>::type>;
 };
 
 
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_x = IndexGlobal<named_dim::x, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_y = IndexGlobal<named_dim::y, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_z = IndexGlobal<named_dim::z, BLOCK_SIZE, named_usage::ignored>;
 
-template <size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
+template <size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified>
 using thread_xyz = IndexFlatten<thread_x<BLOCK_SIZE_X>,
                                 thread_y<BLOCK_SIZE_Y>,
                                 thread_z<BLOCK_SIZE_Z>>;
 
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_x = IndexGlobal<named_dim::x, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_y = IndexGlobal<named_dim::y, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_z = IndexGlobal<named_dim::z, named_usage::ignored, GRID_SIZE>;
 
-template <size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+template <size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
 using block_xyz = IndexFlatten<block_x<GRID_SIZE_X>,
                                block_y<GRID_SIZE_Y>,
                                block_z<GRID_SIZE_Z>>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_x = IndexGlobal<named_dim::x, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_y = IndexGlobal<named_dim::y, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_z = IndexGlobal<named_dim::z, BLOCK_SIZE, GRID_SIZE>;
 
 
 template <size_t BLOCK_SIZE_X,
           size_t BLOCK_SIZE_Y,
           size_t BLOCK_SIZE_Z,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+          size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
 using global_xyz = IndexFlatten<global_x<BLOCK_SIZE_X, GRID_SIZE_X>,
                                 global_y<BLOCK_SIZE_Y, GRID_SIZE_Y>,
                                 global_z<BLOCK_SIZE_Z, GRID_SIZE_Z>>;
 
 
-template <size_t WARP_SIZE=RAJA::policy::cuda::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
-using warp_xyz = IndexDivide<WARP_SIZE,
-                             thread_xyz<BLOCK_SIZE_X,
-                                        BLOCK_SIZE_Y,
-                                        BLOCK_SIZE_Z>>;
-
-template <size_t WARP_SIZE=RAJA::policy::cuda::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
-using warp_global_xyz = IndexFlatten<warp_xyz<WARP_SIZE,
-                                              BLOCK_SIZE_X,
-                                              BLOCK_SIZE_Y,
-                                              BLOCK_SIZE_Z>,
-                                     block_xyz<GRID_SIZE_X,
-                                               GRID_SIZE_Y,
-                                               GRID_SIZE_Z>>;
-
-} // namespace cuda
+template <size_t WARP_SIZE = RAJA::policy::cuda::device_constants.WARP_SIZE,
+          size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified>
+using warp_xyz =
+    IndexDivide<WARP_SIZE,
+                thread_xyz<BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>>;
+
+template <size_t WARP_SIZE = RAJA::policy::cuda::device_constants.WARP_SIZE,
+          size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified,
+          size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
+using warp_global_xyz =
+    IndexFlatten<warp_xyz<WARP_SIZE, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>,
+                 block_xyz<GRID_SIZE_X, GRID_SIZE_Y, GRID_SIZE_Z>>;
+
+}  // namespace cuda
 
 // contretizers used in forall, scan, and sort policies
 
-using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer<cuda::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
+using CudaAvoidDeviceMaxThreadOccupancyConcretizer =
+    cuda::AvoidDeviceMaxThreadOccupancyConcretizer<
+        cuda::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
 
-template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
-using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
+template <typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
+using CudaFractionOffsetOccupancyConcretizer =
+    cuda::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
 
 using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer;
 
@@ -1228,179 +1226,286 @@ using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer;
 
 // policies usable with forall, scan, and sort
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
+template <size_t BLOCK_SIZE,
+          size_t GRID_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async = false>
 using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
 using cuda_exec_grid = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE>
 using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
-using cuda_exec_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, Async>;
+using cuda_exec_explicit =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     BLOCKS_PER_SM,
+                                     Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
-using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, true>;
+using cuda_exec_explicit_async =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     BLOCKS_PER_SM,
+                                     true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
-using cuda_exec = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+using cuda_exec =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     policy::cuda::MIN_BLOCKS_PER_SM,
+                                     Async>;
 
 template <size_t BLOCK_SIZE>
-using cuda_exec_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+using cuda_exec_async =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     policy::cuda::MIN_BLOCKS_PER_SM,
+                                     true>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
 using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
 using cuda_exec_occ_max_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_occ_max_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using cuda_exec_occ_max = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using cuda_exec_occ_max_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Fraction, bool Async = false>
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          typename Fraction,
+          bool Async = false>
 using cuda_exec_occ_fraction_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Fraction>
 using cuda_exec_occ_fraction_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, typename Fraction, bool Async = false>
 using cuda_exec_occ_fraction = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, typename Fraction>
 using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Concretizer, bool Async = false>
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          typename Concretizer,
+          bool Async = false>
 using cuda_exec_occ_custom_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Concretizer>
 using cuda_exec_occ_custom_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
 using cuda_exec_occ_custom = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, typename Concretizer>
 using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
 using cuda_exec_with_reduce_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_with_reduce_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using cuda_exec_with_reduce = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <bool with_reduce, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
-using cuda_exec_base_explicit = std::conditional_t<with_reduce,
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template <bool with_reduce,
+          size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async = false>
+using cuda_exec_base_explicit = std::conditional_t<
+    with_reduce,
     cuda_exec_with_reduce_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>>;
 
 template <bool with_reduce, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
-using cuda_exec_base_explicit_async = std::conditional_t<with_reduce,
+using cuda_exec_base_explicit_async = std::conditional_t<
+    with_reduce,
     cuda_exec_with_reduce_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
     cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>>;
 
 template <bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
-using cuda_exec_base = std::conditional_t<with_reduce,
-    cuda_exec_with_reduce<BLOCK_SIZE, Async>,
-    cuda_exec<BLOCK_SIZE, Async>>;
+using cuda_exec_base =
+    std::conditional_t<with_reduce,
+                       cuda_exec_with_reduce<BLOCK_SIZE, Async>,
+                       cuda_exec<BLOCK_SIZE, Async>>;
 
 template <bool with_reduce, size_t BLOCK_SIZE>
-using cuda_exec_base_async = std::conditional_t<with_reduce,
-    cuda_exec_with_reduce_async<BLOCK_SIZE>,
-    cuda_exec_async<BLOCK_SIZE>>;
+using cuda_exec_base_async =
+    std::conditional_t<with_reduce,
+                       cuda_exec_with_reduce_async<BLOCK_SIZE>,
+                       cuda_exec_async<BLOCK_SIZE>>;
 
 
 // policies usable with WorkGroup
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, bool Async = false>
-using cuda_work_explicit = policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
+          bool Async = false>
+using cuda_work_explicit =
+    policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
-using cuda_work_explicit_async = policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>;
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
+using cuda_work_explicit_async =
+    policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
-using cuda_work = policy::cuda::cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+using cuda_work = policy::cuda::
+    cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
 
 template <size_t BLOCK_SIZE>
-using cuda_work_async = policy::cuda::cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+using cuda_work_async = policy::cuda::
+    cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, true>;
 
 using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average;
 
@@ -1410,10 +1515,10 @@ using policy::cuda::cuda_atomic_explicit;
 
 
 // policies usable with reducers
-template < cuda::reduce_algorithm algorithm,
-           cuda::block_communication_mode comm_mode,
-           size_t replication = named_usage::unspecified,
-           size_t atomic_stride = named_usage::unspecified >
+template <cuda::reduce_algorithm algorithm,
+          cuda::block_communication_mode comm_mode,
+          size_t replication = named_usage::unspecified,
+          size_t atomic_stride = named_usage::unspecified>
 using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy<
     cuda::ReduceTuning<algorithm, comm_mode, replication, atomic_stride>>;
 
@@ -1436,35 +1541,41 @@ using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy<
 //                 a cache shared by the whole device to avoid having to use
 //                 device scope fences. This improves performance on some HW but
 //                 is more difficult to code correctly.
-using cuda_reduce_device_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::combine_last_block,
-    cuda::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_device_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
+                       cuda::block_communication_mode::device_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_block_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::combine_last_block,
-    cuda::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_block_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
+                       cuda::block_communication_mode::block_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_device_combine_atomic_block,
-    cuda::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_device_init_device_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
+                       cuda::block_communication_mode::device_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_device_combine_atomic_block,
-    cuda::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_device_init_block_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
+                       cuda::block_communication_mode::block_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_host_combine_atomic_block,
-    cuda::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_host_init_device_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
+                       cuda::block_communication_mode::device_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_host_combine_atomic_block,
-    cuda::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_host_init_block_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
+                       cuda::block_communication_mode::block_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 
 // Policy for RAJA::Reduce* objects that gives the same answer every time when
 // used in the same way
@@ -1476,25 +1587,26 @@ using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence;
 
 // Policy for RAJA::Reduce* objects that lets you select the default atomic or
 // non-atomic policy with a bool
-template < bool with_atomic >
-using cuda_reduce_base = std::conditional_t<with_atomic, cuda_reduce_atomic, cuda_reduce>;
+template <bool with_atomic>
+using cuda_reduce_base =
+    std::conditional_t<with_atomic, cuda_reduce_atomic, cuda_reduce>;
 
 
 // policies usable with multi_reducers
-template < cuda::multi_reduce_algorithm algorithm,
-           typename SharedAtomicReplicationConcretizer,
-           typename SharedAtomicReplicationIndexer,
-           typename GlobalAtomicReplicationConcretizer,
-           typename GlobalAtomicReplicationIndexer >
-using cuda_multi_reduce_tuning = policy::cuda::cuda_multi_reduce_policy<
-    cuda::MultiReduceTuning<
-      algorithm,
-      cuda::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
-                                    SharedAtomicReplicationIndexer,
-                                    GetOffsetRight<int>>,
-      cuda::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
-                                    GlobalAtomicReplicationIndexer,
-                                    GetOffsetLeft<int>>>>;
+template <cuda::multi_reduce_algorithm algorithm,
+          typename SharedAtomicReplicationConcretizer,
+          typename SharedAtomicReplicationIndexer,
+          typename GlobalAtomicReplicationConcretizer,
+          typename GlobalAtomicReplicationIndexer>
+using cuda_multi_reduce_tuning =
+    policy::cuda::cuda_multi_reduce_policy<cuda::MultiReduceTuning<
+        algorithm,
+        cuda::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
+                                      SharedAtomicReplicationIndexer,
+                                      GetOffsetRight<int>>,
+        cuda::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
+                                      GlobalAtomicReplicationIndexer,
+                                      GetOffsetLeft<int>>>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - *atomic* policies may use atomics to combine partial results. The
@@ -1508,44 +1620,51 @@ using cuda_multi_reduce_tuning = policy::cuda::cuda_multi_reduce_policy<
 //   This is faster overall than other policies on HW with direct host access
 //   to device memory such as the IBM power 9 + Nvidia V100 Sierra/Lassen
 //   systems.
-using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init = cuda_multi_reduce_tuning<
-    cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    cuda::SharedAtomicReplicationMaxPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<16>>,
-    cuda::thread_xyz<>,
-    cuda::GlobalAtomicReplicationMinPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<2>>,
-    cuda::warp_global_xyz<>>;
+using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init =
+    cuda_multi_reduce_tuning<
+        cuda::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        cuda::SharedAtomicReplicationMaxPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<16>>,
+        cuda::thread_xyz<>,
+        cuda::GlobalAtomicReplicationMinPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<2>>,
+        cuda::warp_global_xyz<>>;
 // special policy to test that multi-reducers work if there is not enough shmem
-using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = cuda_multi_reduce_tuning<
-    cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    cuda::SharedAtomicReplicationMaxPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<0>>,
-    cuda::thread_xyz<>,
-    cuda::GlobalAtomicReplicationMinPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<2>>,
-    cuda::warp_global_xyz<>>;
+using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing =
+    cuda_multi_reduce_tuning<
+        cuda::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        cuda::SharedAtomicReplicationMaxPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<0>>,
+        cuda::thread_xyz<>,
+        cuda::GlobalAtomicReplicationMinPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<2>>,
+        cuda::warp_global_xyz<>>;
 //
 using cuda_multi_reduce_atomic_global_host_init = cuda_multi_reduce_tuning<
     cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
+    void,  // unused with this algorithm
+    void,  // unused with this algorithm
     cuda::GlobalAtomicReplicationMinPow2Concretizer<
         cuda::ConstantPreferredReplicationConcretizer<2>>,
     cuda::warp_global_xyz<>>;
 //
-using cuda_multi_reduce_atomic_global_no_replication_host_init = cuda_multi_reduce_tuning<
-    cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
-    cuda::GlobalAtomicReplicationMinPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<1>>,
-    cuda::block_xyz<>>;
-
-// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the
-// same answer every time when used in the same way
-using cuda_multi_reduce_atomic = cuda_multi_reduce_atomic_block_then_atomic_grid_host_init;
-// Similar to above but optimized for low overhead in cases where it is rarely used
+using cuda_multi_reduce_atomic_global_no_replication_host_init =
+    cuda_multi_reduce_tuning<
+        cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
+        void,  // unused with this algorithm
+        void,  // unused with this algorithm
+        cuda::GlobalAtomicReplicationMinPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<1>>,
+        cuda::block_xyz<>>;
+
+// Policy for RAJA::MultiReduce* objects that may use atomics and may not give
+// the same answer every time when used in the same way
+using cuda_multi_reduce_atomic =
+    cuda_multi_reduce_atomic_block_then_atomic_grid_host_init;
+// Similar to above but optimized for low overhead in cases where it is rarely
+// used
 using cuda_multi_reduce_atomic_low_performance_low_overhead =
     cuda_multi_reduce_atomic_global_no_replication_host_init;
 
@@ -1577,53 +1696,61 @@ using policy::cuda::cuda_thread_masked_loop;
 using policy::cuda::cuda_synchronize;
 
 // policies usable with launch
-template <bool Async, int num_threads = named_usage::unspecified, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
-using cuda_launch_explicit_t = policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;
+template <bool Async,
+          int num_threads = named_usage::unspecified,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
+using cuda_launch_explicit_t =
+    policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;
 
-//CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads
+// CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads
 template <bool Async, int num_threads = named_usage::unspecified>
-using cuda_launch_t = policy::cuda::cuda_launch_explicit_t<Async, num_threads,
-    (num_threads == named_usage::unspecified) ? named_usage::unspecified : policy::cuda::MIN_BLOCKS_PER_SM>;
+using cuda_launch_t =
+    policy::cuda::cuda_launch_explicit_t<Async,
+                                         num_threads,
+                                         (num_threads ==
+                                          named_usage::unspecified)
+                                             ? named_usage::unspecified
+                                             : policy::cuda::MIN_BLOCKS_PER_SM>;
 
 
 // policies usable with kernel and launch
-template < typename ... indexers >
-using cuda_indexer_direct_unchecked = policy::cuda::cuda_indexer<
-    iteration_mapping::DirectUnchecked,
-    kernel_sync_requirement::none,
-    indexers...>;
-
-template < typename ... indexers >
-using cuda_indexer_direct = policy::cuda::cuda_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
-
-template < typename ... indexers >
+template <typename... indexers>
+using cuda_indexer_direct_unchecked =
+    policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
+                               kernel_sync_requirement::none,
+                               indexers...>;
+
+template <typename... indexers>
+using cuda_indexer_direct =
+    policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                               kernel_sync_requirement::none,
+                               indexers...>;
+
+template <typename... indexers>
 using cuda_indexer_loop = policy::cuda::cuda_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
     indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::sync,
     indexers...>;
 
-template < typename ... indexers >
-using cuda_flatten_indexer_direct_unchecked = policy::cuda::cuda_flatten_indexer<
-    iteration_mapping::DirectUnchecked,
-    kernel_sync_requirement::none,
-    indexers...>;
+template <typename... indexers>
+using cuda_flatten_indexer_direct_unchecked =
+    policy::cuda::cuda_flatten_indexer<iteration_mapping::DirectUnchecked,
+                                       kernel_sync_requirement::none,
+                                       indexers...>;
 
-template < typename ... indexers >
-using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template <typename... indexers>
+using cuda_flatten_indexer_direct =
+    policy::cuda::cuda_flatten_indexer<iteration_mapping::Direct,
+                                       kernel_sync_requirement::none,
+                                       indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
@@ -1631,48 +1758,83 @@ using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer<
 
 
 // helper to generate the many policy aliases
-#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_POLICIES_HELPER(flatten, scope, mapping) \
-  \
-  using cuda_##flatten##scope##_x_##mapping = cuda_##flatten##scope##_##mapping<named_dim::x>; \
-  using cuda_##flatten##scope##_y_##mapping = cuda_##flatten##scope##_##mapping<named_dim::y>; \
-  using cuda_##flatten##scope##_z_##mapping = cuda_##flatten##scope##_##mapping<named_dim::z>; \
-  \
-  using cuda_##flatten##scope##_xy_##mapping = cuda_##flatten##scope##_##mapping<named_dim::x, named_dim::y>; \
-  using cuda_##flatten##scope##_xz_##mapping = cuda_##flatten##scope##_##mapping<named_dim::x, named_dim::z>; \
-  using cuda_##flatten##scope##_yx_##mapping = cuda_##flatten##scope##_##mapping<named_dim::y, named_dim::x>; \
-  using cuda_##flatten##scope##_yz_##mapping = cuda_##flatten##scope##_##mapping<named_dim::y, named_dim::z>; \
-  using cuda_##flatten##scope##_zx_##mapping = cuda_##flatten##scope##_##mapping<named_dim::z, named_dim::x>; \
-  using cuda_##flatten##scope##_zy_##mapping = cuda_##flatten##scope##_##mapping<named_dim::z, named_dim::y>; \
-  \
-  using cuda_##flatten##scope##_xyz_##mapping = cuda_##flatten##scope##_##mapping<named_dim::x, named_dim::y, named_dim::z>; \
-  using cuda_##flatten##scope##_xzy_##mapping = cuda_##flatten##scope##_##mapping<named_dim::x, named_dim::z, named_dim::y>; \
-  using cuda_##flatten##scope##_yxz_##mapping = cuda_##flatten##scope##_##mapping<named_dim::y, named_dim::x, named_dim::z>; \
-  using cuda_##flatten##scope##_yzx_##mapping = cuda_##flatten##scope##_##mapping<named_dim::y, named_dim::z, named_dim::x>; \
-  using cuda_##flatten##scope##_zxy_##mapping = cuda_##flatten##scope##_##mapping<named_dim::z, named_dim::x, named_dim::y>; \
-  using cuda_##flatten##scope##_zyx_##mapping = cuda_##flatten##scope##_##mapping<named_dim::z, named_dim::y, named_dim::x>;
+#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_POLICIES_HELPER(flatten,    \
+                                                         scope,      \
+                                                         mapping)    \
+                                                                     \
+  using cuda_##flatten##scope##_x_##mapping =                        \
+      cuda_##flatten##scope##_##mapping<named_dim::x>;               \
+  using cuda_##flatten##scope##_y_##mapping =                        \
+      cuda_##flatten##scope##_##mapping<named_dim::y>;               \
+  using cuda_##flatten##scope##_z_##mapping =                        \
+      cuda_##flatten##scope##_##mapping<named_dim::z>;               \
+                                                                     \
+  using cuda_##flatten##scope##_xy_##mapping =                       \
+      cuda_##flatten##scope##_##mapping<named_dim::x, named_dim::y>; \
+  using cuda_##flatten##scope##_xz_##mapping =                       \
+      cuda_##flatten##scope##_##mapping<named_dim::x, named_dim::z>; \
+  using cuda_##flatten##scope##_yx_##mapping =                       \
+      cuda_##flatten##scope##_##mapping<named_dim::y, named_dim::x>; \
+  using cuda_##flatten##scope##_yz_##mapping =                       \
+      cuda_##flatten##scope##_##mapping<named_dim::y, named_dim::z>; \
+  using cuda_##flatten##scope##_zx_##mapping =                       \
+      cuda_##flatten##scope##_##mapping<named_dim::z, named_dim::x>; \
+  using cuda_##flatten##scope##_zy_##mapping =                       \
+      cuda_##flatten##scope##_##mapping<named_dim::z, named_dim::y>; \
+                                                                     \
+  using cuda_##flatten##scope##_xyz_##mapping =                      \
+      cuda_##flatten##scope##_##mapping<named_dim::x,                \
+                                        named_dim::y,                \
+                                        named_dim::z>;               \
+  using cuda_##flatten##scope##_xzy_##mapping =                      \
+      cuda_##flatten##scope##_##mapping<named_dim::x,                \
+                                        named_dim::z,                \
+                                        named_dim::y>;               \
+  using cuda_##flatten##scope##_yxz_##mapping =                      \
+      cuda_##flatten##scope##_##mapping<named_dim::y,                \
+                                        named_dim::x,                \
+                                        named_dim::z>;               \
+  using cuda_##flatten##scope##_yzx_##mapping =                      \
+      cuda_##flatten##scope##_##mapping<named_dim::y,                \
+                                        named_dim::z,                \
+                                        named_dim::x>;               \
+  using cuda_##flatten##scope##_zxy_##mapping =                      \
+      cuda_##flatten##scope##_##mapping<named_dim::z,                \
+                                        named_dim::x,                \
+                                        named_dim::y>;               \
+  using cuda_##flatten##scope##_zyx_##mapping =                      \
+      cuda_##flatten##scope##_##mapping<named_dim::z,                \
+                                        named_dim::y,                \
+                                        named_dim::x>;
 
 // helper to generate the many thread policy aliases
-#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(flatten, mapping) \
-  template < named_dim ... dims > \
+#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(flatten, mapping)    \
+  template <named_dim... dims>                                                \
   using cuda_##flatten##thread_##mapping = cuda_##flatten##indexer_##mapping< \
-      cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>; \
-  \
+      cuda::IndexGlobal<dims,                                                 \
+                        named_usage::unspecified,                             \
+                        named_usage::ignored>...>;                            \
+                                                                              \
   RAJA_INTERNAL_CUDA_ALIAS_INDEXER_POLICIES_HELPER(flatten, thread, mapping)
 
 // helper to generate the many block policy aliases
-#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(flatten, mapping) \
-  template < named_dim ... dims > \
+#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_POLICIES(flatten, mapping)    \
+  template <named_dim... dims>                                               \
   using cuda_##flatten##block_##mapping = cuda_##flatten##indexer_##mapping< \
-      cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>; \
-  \
+      cuda::IndexGlobal<dims,                                                \
+                        named_usage::ignored,                                \
+                        named_usage::unspecified>...>;                       \
+                                                                             \
   RAJA_INTERNAL_CUDA_ALIAS_INDEXER_POLICIES_HELPER(flatten, block, mapping)
 
 // helper to generate the many global policy aliases
-#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten, mapping) \
-  template < named_dim ... dims > \
+#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten, mapping)    \
+  template <named_dim... dims>                                                \
   using cuda_##flatten##global_##mapping = cuda_##flatten##indexer_##mapping< \
-      cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>; \
-  \
+      cuda::IndexGlobal<dims,                                                 \
+                        named_usage::unspecified,                             \
+                        named_usage::unspecified>...>;                        \
+                                                                              \
   RAJA_INTERNAL_CUDA_ALIAS_INDEXER_POLICIES_HELPER(flatten, global, mapping)
 
 
@@ -1680,8 +1842,8 @@ using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer<
  * Maps segment indices to CUDA threads, blocks, or global threads.
  * This is the lowest overhead mapping, but requires that there are the same
  * number of physical threads, blocks, or global threads as map requests.
- * For example, a segment of size 1000 will only fit into 1000 threads, blocks, or global threads, and
- * triggers a runtime error in some cases.
+ * For example, a segment of size 1000 will only fit into 1000 threads, blocks,
+ * or global threads, and triggers a runtime error in some cases.
  */
 RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(, direct_unchecked)
 
@@ -1692,9 +1854,9 @@ RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(, direct_unchecked)
 /*!
  * Maps segment indices to CUDA threads, blocks, or global threads.
  * This is a low overhead mapping, but requires that there are enough
- * physical threads, blocks, or global threads to fit all of the direct map requests.
- * For example, a segment of size 2000 will not fit into 1024 threads, blocks,
- * or global threads, and triggers a runtime error in some cases.
+ * physical threads, blocks, or global threads to fit all of the direct map
+ * requests. For example, a segment of size 2000 will not fit into 1024 threads,
+ * blocks, or global threads, and triggers a runtime error in some cases.
  */
 RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_POLICIES(, direct)
 
@@ -1769,118 +1931,229 @@ RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, loop)
 
 
 // helper to generate the many one size policy aliases
-#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, scope, mapping) \
-  \
-  template < int X_SIZE > \
-  using cuda_##flatten##scope##_size_x_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>>; \
-  template < int Y_SIZE > \
-  using cuda_##flatten##scope##_size_y_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>>; \
-  template < int Z_SIZE > \
-  using cuda_##flatten##scope##_size_z_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>>; \
-  \
-  template < int X_SIZE, int Y_SIZE > \
-  using cuda_##flatten##scope##_size_xy_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>, cuda::scope##_y<Y_SIZE>>; \
-  template < int X_SIZE, int Z_SIZE > \
-  using cuda_##flatten##scope##_size_xz_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>, cuda::scope##_z<Z_SIZE>>; \
-  template < int Y_SIZE, int X_SIZE > \
-  using cuda_##flatten##scope##_size_yx_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>, cuda::scope##_x<X_SIZE>>; \
-  template < int Y_SIZE, int Z_SIZE > \
-  using cuda_##flatten##scope##_size_yz_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>, cuda::scope##_z<Z_SIZE>>; \
-  template < int Z_SIZE, int X_SIZE > \
-  using cuda_##flatten##scope##_size_zx_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>, cuda::scope##_x<X_SIZE>>; \
-  template < int Z_SIZE, int Y_SIZE > \
-  using cuda_##flatten##scope##_size_zy_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>, cuda::scope##_y<Y_SIZE>>; \
-  \
-  template < int X_SIZE, int Y_SIZE, int Z_SIZE > \
-  using cuda_##flatten##scope##_size_xyz_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>, cuda::scope##_y<Y_SIZE>, cuda::scope##_z<Z_SIZE>>; \
-  template < int X_SIZE, int Z_SIZE, int Y_SIZE > \
-  using cuda_##flatten##scope##_size_xzy_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>, cuda::scope##_z<Z_SIZE>, cuda::scope##_y<Y_SIZE>>; \
-  template < int Y_SIZE, int X_SIZE, int Z_SIZE > \
-  using cuda_##flatten##scope##_size_yxz_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>, cuda::scope##_x<X_SIZE>, cuda::scope##_z<Z_SIZE>>; \
-  template < int Y_SIZE, int Z_SIZE, int X_SIZE > \
-  using cuda_##flatten##scope##_size_yzx_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>, cuda::scope##_z<Z_SIZE>, cuda::scope##_x<X_SIZE>>; \
-  template < int Z_SIZE, int X_SIZE, int Y_SIZE > \
-  using cuda_##flatten##scope##_size_zxy_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>, cuda::scope##_x<X_SIZE>, cuda::scope##_y<Y_SIZE>>; \
-  template < int Z_SIZE, int Y_SIZE, int X_SIZE > \
-  using cuda_##flatten##scope##_size_zyx_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>, cuda::scope##_y<Y_SIZE>, cuda::scope##_x<X_SIZE>>;
+#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, \
+                                                                  scope,   \
+                                                                  mapping) \
+                                                                           \
+  template <int X_SIZE>                                                    \
+  using cuda_##flatten##scope##_size_x_##mapping =                         \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>>;          \
+  template <int Y_SIZE>                                                    \
+  using cuda_##flatten##scope##_size_y_##mapping =                         \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>>;          \
+  template <int Z_SIZE>                                                    \
+  using cuda_##flatten##scope##_size_z_##mapping =                         \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>>;          \
+                                                                           \
+  template <int X_SIZE, int Y_SIZE>                                        \
+  using cuda_##flatten##scope##_size_xy_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>,           \
+                                        cuda::scope##_y<Y_SIZE>>;          \
+  template <int X_SIZE, int Z_SIZE>                                        \
+  using cuda_##flatten##scope##_size_xz_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>,           \
+                                        cuda::scope##_z<Z_SIZE>>;          \
+  template <int Y_SIZE, int X_SIZE>                                        \
+  using cuda_##flatten##scope##_size_yx_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>,           \
+                                        cuda::scope##_x<X_SIZE>>;          \
+  template <int Y_SIZE, int Z_SIZE>                                        \
+  using cuda_##flatten##scope##_size_yz_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>,           \
+                                        cuda::scope##_z<Z_SIZE>>;          \
+  template <int Z_SIZE, int X_SIZE>                                        \
+  using cuda_##flatten##scope##_size_zx_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>,           \
+                                        cuda::scope##_x<X_SIZE>>;          \
+  template <int Z_SIZE, int Y_SIZE>                                        \
+  using cuda_##flatten##scope##_size_zy_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>,           \
+                                        cuda::scope##_y<Y_SIZE>>;          \
+                                                                           \
+  template <int X_SIZE, int Y_SIZE, int Z_SIZE>                            \
+  using cuda_##flatten##scope##_size_xyz_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>,           \
+                                        cuda::scope##_y<Y_SIZE>,           \
+                                        cuda::scope##_z<Z_SIZE>>;          \
+  template <int X_SIZE, int Z_SIZE, int Y_SIZE>                            \
+  using cuda_##flatten##scope##_size_xzy_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_SIZE>,           \
+                                        cuda::scope##_z<Z_SIZE>,           \
+                                        cuda::scope##_y<Y_SIZE>>;          \
+  template <int Y_SIZE, int X_SIZE, int Z_SIZE>                            \
+  using cuda_##flatten##scope##_size_yxz_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>,           \
+                                        cuda::scope##_x<X_SIZE>,           \
+                                        cuda::scope##_z<Z_SIZE>>;          \
+  template <int Y_SIZE, int Z_SIZE, int X_SIZE>                            \
+  using cuda_##flatten##scope##_size_yzx_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_SIZE>,           \
+                                        cuda::scope##_z<Z_SIZE>,           \
+                                        cuda::scope##_x<X_SIZE>>;          \
+  template <int Z_SIZE, int X_SIZE, int Y_SIZE>                            \
+  using cuda_##flatten##scope##_size_zxy_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>,           \
+                                        cuda::scope##_x<X_SIZE>,           \
+                                        cuda::scope##_y<Y_SIZE>>;          \
+  template <int Z_SIZE, int Y_SIZE, int X_SIZE>                            \
+  using cuda_##flatten##scope##_size_zyx_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_SIZE>,           \
+                                        cuda::scope##_y<Y_SIZE>,           \
+                                        cuda::scope##_x<X_SIZE>>;
 
 // helper to generate the many two size policy aliases
-#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, scope, mapping) \
-  \
-  template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_x_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_y_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_z_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  \
-  template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, \
-             int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_xy_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                       cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, \
-             int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_xz_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                       cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, \
-             int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_yx_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                       cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, \
-             int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_yz_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                       cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, \
-             int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_zx_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                       cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, \
-             int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_zy_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                       cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  \
-  template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, \
-             int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_xyz_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                        cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                        cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, \
-             int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_xzy_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                        cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                        cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, \
-             int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_yxz_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                        cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                        cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, \
-             int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_yzx_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                        cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                        cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, \
-             int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_zxy_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                        cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                        cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, \
-             int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \
-  using cuda_##flatten##scope##_size_zyx_##mapping = cuda_##flatten##indexer_##mapping<cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                        cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                        cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, \
+                                                                  scope,   \
+                                                                  mapping) \
+                                                                           \
+  template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>  \
+  using cuda_##flatten##scope##_size_x_##mapping =                         \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>  \
+  using cuda_##flatten##scope##_size_y_##mapping =                         \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>  \
+  using cuda_##flatten##scope##_size_z_##mapping =                         \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+                                                                           \
+  template <int X_BLOCK_SIZE,                                              \
+            int Y_BLOCK_SIZE,                                              \
+            int X_GRID_SIZE = named_usage::unspecified,                    \
+            int Y_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_xy_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+  template <int X_BLOCK_SIZE,                                              \
+            int Z_BLOCK_SIZE,                                              \
+            int X_GRID_SIZE = named_usage::unspecified,                    \
+            int Z_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_xz_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE,                                              \
+            int X_BLOCK_SIZE,                                              \
+            int Y_GRID_SIZE = named_usage::unspecified,                    \
+            int X_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_yx_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE,                                              \
+            int Z_BLOCK_SIZE,                                              \
+            int Y_GRID_SIZE = named_usage::unspecified,                    \
+            int Z_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_yz_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE,                                              \
+            int X_BLOCK_SIZE,                                              \
+            int Z_GRID_SIZE = named_usage::unspecified,                    \
+            int X_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_zx_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE,                                              \
+            int Y_BLOCK_SIZE,                                              \
+            int Z_GRID_SIZE = named_usage::unspecified,                    \
+            int Y_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_zy_##mapping =                        \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+                                                                           \
+  template <int X_BLOCK_SIZE,                                              \
+            int Y_BLOCK_SIZE,                                              \
+            int Z_BLOCK_SIZE,                                              \
+            int X_GRID_SIZE = named_usage::unspecified,                    \
+            int Y_GRID_SIZE = named_usage::unspecified,                    \
+            int Z_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_xyz_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+  template <int X_BLOCK_SIZE,                                              \
+            int Z_BLOCK_SIZE,                                              \
+            int Y_BLOCK_SIZE,                                              \
+            int X_GRID_SIZE = named_usage::unspecified,                    \
+            int Z_GRID_SIZE = named_usage::unspecified,                    \
+            int Y_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_xzy_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE,                                              \
+            int X_BLOCK_SIZE,                                              \
+            int Z_BLOCK_SIZE,                                              \
+            int Y_GRID_SIZE = named_usage::unspecified,                    \
+            int X_GRID_SIZE = named_usage::unspecified,                    \
+            int Z_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_yxz_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE,                                              \
+            int Z_BLOCK_SIZE,                                              \
+            int X_BLOCK_SIZE,                                              \
+            int Y_GRID_SIZE = named_usage::unspecified,                    \
+            int Z_GRID_SIZE = named_usage::unspecified,                    \
+            int X_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_yzx_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE,                                              \
+            int X_BLOCK_SIZE,                                              \
+            int Y_BLOCK_SIZE,                                              \
+            int Z_GRID_SIZE = named_usage::unspecified,                    \
+            int X_GRID_SIZE = named_usage::unspecified,                    \
+            int Y_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_zxy_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE,                                              \
+            int Y_BLOCK_SIZE,                                              \
+            int X_BLOCK_SIZE,                                              \
+            int Z_GRID_SIZE = named_usage::unspecified,                    \
+            int Y_GRID_SIZE = named_usage::unspecified,                    \
+            int X_GRID_SIZE = named_usage::unspecified>                    \
+  using cuda_##flatten##scope##_size_zyx_##mapping =                       \
+      cuda_##flatten##indexer_##mapping<                                   \
+          cuda::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          cuda::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          cuda::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 // helper to generate the many thread size policy aliases
-#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten, mapping) \
-    RAJA_INTERNAL_CUDA_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, thread, mapping)
+#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten, \
+                                                              mapping) \
+  RAJA_INTERNAL_CUDA_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten,   \
+                                                            thread,    \
+                                                            mapping)
 
 // helper to generate the many block size policy aliases
 #define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten, mapping) \
-    RAJA_INTERNAL_CUDA_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, block, mapping)
+  RAJA_INTERNAL_CUDA_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten,           \
+                                                            block,             \
+                                                            mapping)
 
 // helper to generate the many global size policy aliases
-#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten, mapping) \
-    RAJA_INTERNAL_CUDA_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, global, mapping)
+#define RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten, \
+                                                              mapping) \
+  RAJA_INTERNAL_CUDA_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten,   \
+                                                            global,    \
+                                                            mapping)
 
 
 /*!
@@ -1924,11 +2197,13 @@ RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(, loop)
  * Reshapes multiple physical threads, blocks, or global threads into a 1D
  * iteration space.
  */
-RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_, direct_unchecked)
+RAJA_INTERNAL_CUDA_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten_,
+                                                      direct_unchecked)
 
 RAJA_INTERNAL_CUDA_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten_, direct_unchecked)
 
-RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_, direct_unchecked)
+RAJA_INTERNAL_CUDA_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten_,
+                                                      direct_unchecked)
 
 /*
  * Maps segment indices to flattened CUDA threads, blocks, or global threads.
diff --git a/include/RAJA/policy/cuda/raja_cudaerrchk.hpp b/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
index 409ec16818..973d2665cf 100644
--- a/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
+++ b/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
@@ -24,12 +24,12 @@
 
 #if defined(RAJA_ENABLE_CUDA)
 
-#include <iostream>
-#include <string>
-
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+#include <iostream>
+#include <string>
+
 #include "RAJA/util/macros.hpp"
 
 namespace RAJA
@@ -64,8 +64,11 @@ inline void cudaAssert(cudaError_t code,
       msg += std::to_string(line);
       throw std::runtime_error(msg);
     } else {
-      fprintf(stderr, "CUDAassert: %s %s %d\n",
-              cudaGetErrorString(code), file, line);
+      fprintf(stderr,
+              "CUDAassert: %s %s %d\n",
+              cudaGetErrorString(code),
+              file,
+              line);
     }
   }
 }
diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp
index 2b13417531..d8af64fb6e 100644
--- a/include/RAJA/policy/cuda/reduce.hpp
+++ b/include/RAJA/policy/cuda/reduce.hpp
@@ -25,28 +25,26 @@
 
 #if defined(RAJA_ENABLE_CUDA)
 
-#include <type_traits>
-
 #include <cuda.h>
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/SoAArray.hpp"
-#include "RAJA/util/SoAPtr.hpp"
-#include "RAJA/util/basic_mempool.hpp"
-#include "RAJA/util/mutex.hpp"
-#include "RAJA/util/types.hpp"
-#include "RAJA/util/reduce.hpp"
+#include <type_traits>
 
 #include "RAJA/pattern/detail/reduce.hpp"
 #include "RAJA/pattern/reduce.hpp"
-
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/policy/cuda/intrinsics.hpp"
+#include "RAJA/util/SoAArray.hpp"
+#include "RAJA/util/SoAPtr.hpp"
+#include "RAJA/util/basic_mempool.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/mutex.hpp"
+#include "RAJA/util/reduce.hpp"
+#include "RAJA/util/types.hpp"
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-  #include "RAJA/policy/desul/atomic.hpp"
+#include "RAJA/policy/desul/atomic.hpp"
 #else
-  #include "RAJA/policy/cuda/atomic.hpp"
+#include "RAJA/policy/cuda/atomic.hpp"
 #endif
 
 #include "RAJA/policy/cuda/policy.hpp"
@@ -124,15 +122,19 @@ namespace impl
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride,
-          typename T, typename TempIterator>
+template <typename Combiner,
+          typename Accessor,
+          int replication,
+          int atomic_stride,
+          typename T,
+          typename TempIterator>
 RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
-                                        T identity,
-                                        TempIterator in_device_mem,
-                                        unsigned int* device_count)
+                                                   T identity,
+                                                   TempIterator in_device_mem,
+                                                   unsigned int* device_count)
 {
-  typename TempIterator::template rebind_accessor<Accessor> device_mem(in_device_mem);
+  typename TempIterator::template rebind_accessor<Accessor> device_mem(
+      in_device_mem);
 
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -147,7 +149,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
 
   int maxNumSlots = (numBlocks + replication - 1) / replication;
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
   int atomicOffset = replicationId * atomic_stride;
   int beginSlots = replicationId * maxNumSlots;
@@ -169,8 +171,9 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
     // ensure write visible to all threadblocks
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots-1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1));
-    isLastBlock = (old_count == (numSlots-1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
+    isLastBlock = (old_count == (numSlots - 1));
   }
 
   // returns non-zero value if any thread passes in a non-zero value
@@ -181,9 +184,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
     temp = identity;
     Accessor::fence_acquire();
 
-    for (unsigned int i = threadId;
-                      i < numSlots;
-                      i += numThreads) {
+    for (unsigned int i = threadId; i < numSlots; i += numThreads) {
       Combiner{}(temp, device_mem.get(beginSlots + i));
     }
 
@@ -198,7 +199,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   return (isLastBlock && threadId == 0) ? replicationId : replication;
 }
 
-namespace expt {
+namespace expt
+{
 
 template <typename ThreadIterationGetter, typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
@@ -214,7 +216,8 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   if (numThreads % RAJA::policy::cuda::device_constants.WARP_SIZE == 0) {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE;
+         i *= 2) {
       T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
       temp = Combiner{}(temp, rhs);
     }
@@ -222,7 +225,8 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   } else {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE;
+         i *= 2) {
       int srcLane = threadId ^ i;
       T rhs = RAJA::cuda::impl::shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
@@ -232,18 +236,25 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
     }
   }
 
-  static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <= RAJA::policy::cuda::device_constants.WARP_SIZE,
-               "Max Warps must be less than or equal to Warp Size for this algorithm to work");
+  static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
+                "Max Warps must be less than or equal to Warp Size for this "
+                "algorithm to work");
 
   // reduce per warp values
   if (numThreads > RAJA::policy::cuda::device_constants.WARP_SIZE) {
 
     // Need to separate declaration and initialization for clang-cuda
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::
+            SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS>)];
 
     // Partial placement new: Should call new(tmpsd) here but recasting memory
     // to avoid calling constructor/destructor in shared memory.
-    RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS> * sd = reinterpret_cast<RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS> *>(tmpsd);
+    RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS>*
+        sd = reinterpret_cast<RAJA::detail::SoAArray<
+            T,
+            RAJA::policy::cuda::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
     if (warpId == 0) {
@@ -255,13 +266,15 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
     if (warpNum == 0) {
 
       // read per warp values
-      if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE < numThreads) {
+      if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE <
+          numThreads) {
         temp = sd->get(warpId);
       } else {
         temp = identity;
       }
 
-      for (int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS;
+           i *= 2) {
         T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
         temp = Combiner{}(temp, rhs);
       }
@@ -275,13 +288,16 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 
 
 template <typename GlobalIterationGetter, typename OP, typename T>
-RAJA_DEVICE RAJA_INLINE void grid_reduce( T * device_target,
-                                          T val,
-                                          RAJA::detail::SoAPtr<T,RAJA::cuda::device_mempool_type> device_mem,
-                                          unsigned int* device_count)
+RAJA_DEVICE RAJA_INLINE void grid_reduce(
+    T* device_target,
+    T val,
+    RAJA::detail::SoAPtr<T, RAJA::cuda::device_mempool_type> device_mem,
+    unsigned int* device_count)
 {
-  using BlockIterationGetter = typename get_index_block<GlobalIterationGetter>::type;
-  using ThreadIterationGetter = typename get_index_thread<GlobalIterationGetter>::type;
+  using BlockIterationGetter =
+      typename get_index_block<GlobalIterationGetter>::type;
+  using ThreadIterationGetter =
+      typename get_index_thread<GlobalIterationGetter>::type;
 
   const int numBlocks = BlockIterationGetter::size();
   const int numThreads = ThreadIterationGetter::size();
@@ -324,17 +340,21 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce( T * device_target,
   }
 }
 
-} //  namespace expt
+}  //  namespace expt
 
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride, typename T>
-RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
-                                               T identity,
-                                               T* device_mem,
-                                               unsigned int* device_count)
+template <typename Combiner,
+          typename Accessor,
+          int replication,
+          int atomic_stride,
+          typename T>
+RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(
+    T& val,
+    T identity,
+    T* device_mem,
+    unsigned int* device_count)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -343,11 +363,11 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
                 (gridDim.x * gridDim.y) * blockIdx.z;
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset = replicationId * atomic_stride;
 
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
   if (numSlots <= 1u) {
     T temp = block_reduce<Combiner>(val, identity);
@@ -379,8 +399,9 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
     RAJA::reduce::cuda::atomic<Combiner>{}(device_mem[atomicOffset], temp);
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots+1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1));
-    isLastBlock = (old_count == (numSlots+1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
+    isLastBlock = (old_count == (numSlots + 1));
 
     // the last block for each replication gets the value from device_mem
     if (isLastBlock) {
@@ -395,8 +416,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 //! reduce values in block into thread 0 and atomically combines into device_mem
 template <typename Combiner, int replication, int atomic_stride, typename T>
 RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
-                                                            T identity,
-                                                            T* device_mem)
+                                                          T identity,
+                                                          T* device_mem)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -404,8 +425,8 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                 (gridDim.x * gridDim.y) * blockIdx.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset = replicationId * atomic_stride;
 
   T temp = block_reduce<Combiner>(val, identity);
 
@@ -501,7 +522,7 @@ class PinnedTally
       return ret;
     }
 
-    auto operator*() -> T(&)[num_slots] { return m_n->values; }
+    auto operator*() -> T (&)[num_slots] { return m_n->values; }
 
     bool operator==(const ResourceNodeIterator& rhs) const
     {
@@ -538,7 +559,7 @@ class PinnedTally
   ResourceNodeIterator end() { return {nullptr, nullptr}; }
 
   //! get new value for use in resource
-  auto new_value(::RAJA::resources::Cuda res) -> T(&)[num_slots]
+  auto new_value(::RAJA::resources::Cuda res) -> T (&)[num_slots]
   {
 #if defined(RAJA_ENABLE_OPENMP)
     lock_guard<omp::mutex> lock(m_mutex);
@@ -605,10 +626,12 @@ class PinnedTally
 
 //! Reduction data for Cuda Offload -- stores value, host pointer, and device
 //! pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
-struct ReduceLastBlock_Data
-{
+template <typename Combiner,
+          typename Accessor,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
+struct ReduceLastBlock_Data {
   using tally_mempool_type = pinned_mempool_type;
   using data_mempool_type = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
@@ -621,7 +644,7 @@ struct ReduceLastBlock_Data
   RAJA::detail::SoAPtr<T, data_mempool_type> device;
   bool owns_device_pointer;
 
-  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){}
+  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()) {}
 
   /*! \brief create from a default value and offload information
    *
@@ -651,7 +674,7 @@ struct ReduceLastBlock_Data
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
     for (size_t r = 0; r < tally_slots; ++r) {
       output[r] = identity;
@@ -665,9 +688,9 @@ struct ReduceLastBlock_Data
   {
     T temp = value;
 
-    size_t replicationId = impl::grid_reduce_last_block<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
+    size_t replicationId = impl::
+        grid_reduce_last_block<Combiner, Accessor, replication, atomic_stride>(
+            temp, identity, device, device_count);
     if (replicationId != replication) {
       output[replicationId] = temp;
     }
@@ -682,9 +705,10 @@ struct ReduceLastBlock_Data
       cuda_dim_t gridDim = currentGridDim();
       size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
       size_t maxNumSlots = (numBlocks + replication - 1) / replication;
-      device.allocate(maxNumSlots*replication);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+      device.allocate(maxNumSlots * replication);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       owns_device_pointer = true;
     }
     return act;
@@ -706,10 +730,11 @@ struct ReduceLastBlock_Data
 };
 
 //! Reduction data for Cuda Offload -- stores value, host pointer
-template <typename Combiner, typename T,
-          size_t replication, size_t atomic_stride>
-struct ReduceAtomicHostInit_Data
-{
+template <typename Combiner,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
+struct ReduceAtomicHostInit_Data {
   using tally_mempool_type = device_pinned_mempool_type;
 
   static constexpr size_t tally_slots = replication * atomic_stride;
@@ -738,11 +763,12 @@ struct ReduceAtomicHostInit_Data
   {
   }
 
-  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default;
+  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) =
+      default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
     for (size_t r = 0; r < tally_slots; ++r) {
       output[r] = identity;
@@ -756,9 +782,8 @@ struct ReduceAtomicHostInit_Data
   {
     T temp = value;
 
-    impl::grid_reduce_atomic_host_init<Combiner,
-        replication, atomic_stride>(
-            temp, identity, output);
+    impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
+        temp, identity, output);
   }
 
   //! check and setup for device
@@ -787,10 +812,12 @@ struct ReduceAtomicHostInit_Data
 };
 
 //! Reduction data for Cuda Offload -- stores value, host pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
-struct ReduceAtomicDeviceInit_Data
-{
+template <typename Combiner,
+          typename Accessor,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
+struct ReduceAtomicDeviceInit_Data {
   using tally_mempool_type = pinned_mempool_type;
   using data_mempool_type = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
@@ -824,11 +851,12 @@ struct ReduceAtomicDeviceInit_Data
   {
   }
 
-  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default;
+  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) =
+      default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
     for (size_t r = 0; r < tally_slots; ++r) {
       output[r] = identity;
@@ -842,9 +870,11 @@ struct ReduceAtomicDeviceInit_Data
   {
     T temp = value;
 
-    size_t replicationId = impl::grid_reduce_atomic_device_init<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
+    size_t replicationId = impl::grid_reduce_atomic_device_init<Combiner,
+                                                                Accessor,
+                                                                replication,
+                                                                atomic_stride>(
+        temp, identity, device, device_count);
     if (replicationId != replication) {
       output[replicationId] = temp;
     }
@@ -856,9 +886,11 @@ struct ReduceAtomicDeviceInit_Data
   {
     bool act = !device && setupReducers();
     if (act) {
-      device = data_mempool_type::getInstance().template malloc<T>(replication*atomic_stride);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+      device = data_mempool_type::getInstance().template malloc<T>(
+          replication * atomic_stride);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       owns_device_pointer = true;
     }
     return act;
@@ -885,41 +917,68 @@ struct ReduceAtomicDeviceInit_Data
 template <typename Combiner, typename T, typename tuning>
 class Reduce
 {
-  static constexpr size_t replication = (tuning::replication > 0)
-      ? tuning::replication
-      : 1;
-  static constexpr size_t atomic_stride = (tuning::atomic_stride > 0)
-      ? tuning::atomic_stride
-      : ((policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
-        ? RAJA_DIVIDE_CEILING_INT(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T))
-        : 1);
-
-  using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence),
+  static constexpr size_t replication =
+      (tuning::replication > 0) ? tuning::replication : 1;
+  static constexpr size_t atomic_stride =
+      (tuning::atomic_stride > 0)
+          ? tuning::atomic_stride
+          : ((policy::cuda::device_constants
+                  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
+                 ? RAJA_DIVIDE_CEILING_INT(
+                       policy::cuda::device_constants
+                           .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
+                       sizeof(T))
+                 : 1);
+
+  using Accessor = std::conditional_t<
+      (tuning::comm_mode == block_communication_mode::block_fence),
       impl::AccessorDeviceScopeUseBlockFence,
-      std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence),
-        impl::AccessorDeviceScopeUseDeviceFence,
-        void>>;
+      std::conditional_t<(tuning::comm_mode ==
+                          block_communication_mode::device_fence),
+                         impl::AccessorDeviceScopeUseDeviceFence,
+                         void>>;
 
   static constexpr bool atomic_policy =
-      (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) ||
+      (tuning::algorithm ==
+       reduce_algorithm::init_device_combine_atomic_block) ||
       (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
-  static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::cuda::cuda_atomic_available<T>::value;
 
   //! cuda reduction data storage class and folding algorithm
-  using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) ||
-                                              (atomic_policy && !atomic_available),
-      cuda::ReduceLastBlock_Data<Combiner, Accessor, T, replication, atomic_stride>,
-      std::conditional_t<atomic_available,
-        std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block),
-          cuda::ReduceAtomicDeviceInit_Data<Combiner, Accessor, T, replication, atomic_stride>,
-          std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block),
-            cuda::ReduceAtomicHostInit_Data<Combiner, T, replication, atomic_stride>,
-            void>>,
-        void>>;
+  using reduce_data_type = std::conditional_t<
+      (tuning::algorithm == reduce_algorithm::combine_last_block) ||
+          (atomic_policy && !atomic_available),
+      cuda::ReduceLastBlock_Data<Combiner,
+                                 Accessor,
+                                 T,
+                                 replication,
+                                 atomic_stride>,
+      std::conditional_t<
+          atomic_available,
+          std::conditional_t<
+              (tuning::algorithm ==
+               reduce_algorithm::init_device_combine_atomic_block),
+              cuda::ReduceAtomicDeviceInit_Data<Combiner,
+                                                Accessor,
+                                                T,
+                                                replication,
+                                                atomic_stride>,
+              std::conditional_t<
+                  (tuning::algorithm ==
+                   reduce_algorithm::init_host_combine_atomic_block),
+                  cuda::ReduceAtomicHostInit_Data<Combiner,
+                                                  T,
+                                                  replication,
+                                                  atomic_stride>,
+                  void>>,
+          void>>;
 
   static constexpr size_t tally_slots = reduce_data_type::tally_slots;
 
-  using TallyType = PinnedTally<T, tally_slots, typename reduce_data_type::tally_mempool_type>;
+  using TallyType = PinnedTally<T,
+                                tally_slots,
+                                typename reduce_data_type::tally_mempool_type>;
 
   //! union to hold either pointer to PinnedTally or pointer to value
   //  only use list before setup for device and only use val_ptr after
@@ -936,9 +995,7 @@ class Reduce
   //! create a reduce object
   //  the original object's parent is itself
   explicit Reduce(T init_val, T identity_ = Combiner::identity())
-      : parent{this},
-        tally_or_val_ptr{new TallyType},
-        val(init_val, identity_)
+      : parent{this}, tally_or_val_ptr{new TallyType}, val(init_val, identity_)
   {
   }
 
@@ -1137,9 +1194,10 @@ class ReduceMax<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 //! specialization of ReduceMinLoc for cuda_reduce
 template <typename tuning, typename T, typename IndexType>
 class ReduceMinLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
-    : public cuda::Reduce<RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
-                          RAJA::reduce::detail::ValueLoc<T, IndexType>,
-                          tuning>
+    : public cuda::Reduce<
+          RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType>,
+          tuning>
 {
 
 public:
@@ -1150,20 +1208,26 @@ class ReduceMinLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMinLoc(T init_val, IndexType init_idx,
+  ReduceMinLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
   {
   }
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
@@ -1187,10 +1251,11 @@ class ReduceMinLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
 //! specialization of ReduceMaxLoc for cuda_reduce
 template <typename tuning, typename T, typename IndexType>
 class ReduceMaxLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
-    : public cuda::
-          Reduce<RAJA::reduce::max<RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
-                 RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
-                 tuning>
+    : public cuda::Reduce<
+          RAJA::reduce::max<
+              RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
+          tuning>
 {
 public:
   using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
@@ -1200,20 +1265,26 @@ class ReduceMaxLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMaxLoc(T init_val, IndexType init_idx,
+  ReduceMaxLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
   {
   }
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp
index 0a9b0bf305..86db339c20 100644
--- a/include/RAJA/policy/cuda/scan.hpp
+++ b/include/RAJA/policy/cuda/scan.hpp
@@ -25,11 +25,10 @@
 #include <iterator>
 #include <type_traits>
 
-#include "cub/device/device_scan.cuh"
-#include "cub/util_allocator.cuh"
-
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/policy/cuda/policy.hpp"
+#include "cub/device/device_scan.cuh"
+#include "cub/util_allocator.cuh"
 
 namespace RAJA
 {
@@ -49,11 +48,13 @@ template <typename IterationMapping,
           bool Async,
           typename InputIter,
           typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-inclusive_inplace(
+RAJA_INLINE resources::EventProxy<resources::Cuda> inclusive_inplace(
     resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
+    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                             IterationGetter,
+                                             Concretizer,
+                                             BLOCKS_PER_SM,
+                                             Async>,
     InputIter begin,
     InputIter end,
     Function binary_op)
@@ -103,11 +104,13 @@ template <typename IterationMapping,
           typename InputIter,
           typename Function,
           typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-exclusive_inplace(
+RAJA_INLINE resources::EventProxy<resources::Cuda> exclusive_inplace(
     resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
+    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                             IterationGetter,
+                                             Concretizer,
+                                             BLOCKS_PER_SM,
+                                             Async>,
     InputIter begin,
     InputIter end,
     Function binary_op,
@@ -160,11 +163,13 @@ template <typename IterationMapping,
           typename InputIter,
           typename OutputIter,
           typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-inclusive(
+RAJA_INLINE resources::EventProxy<resources::Cuda> inclusive(
     resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
+    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                             IterationGetter,
+                                             Concretizer,
+                                             BLOCKS_PER_SM,
+                                             Async>,
     InputIter begin,
     InputIter end,
     OutputIter out,
@@ -176,25 +181,15 @@ inclusive(
   // Determine temporary device storage requirements
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              len,
-                                              stream));
+  cudaErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
   // Run
-  cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              len,
-                                              stream));
+  cudaErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
@@ -216,11 +211,13 @@ template <typename IterationMapping,
           typename OutputIter,
           typename Function,
           typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-exclusive(
+RAJA_INLINE resources::EventProxy<resources::Cuda> exclusive(
     resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
+    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                             IterationGetter,
+                                             Concretizer,
+                                             BLOCKS_PER_SM,
+                                             Async>,
     InputIter begin,
     InputIter end,
     OutputIter out,
diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp
index c5a353b704..cb1efab00a 100644
--- a/include/RAJA/policy/cuda/sort.hpp
+++ b/include/RAJA/policy/cuda/sort.hpp
@@ -26,13 +26,12 @@
 #include <iterator>
 #include <type_traits>
 
-#include "cub/device/device_radix_sort.cuh"
-
-#include "RAJA/util/concepts.hpp"
-#include "RAJA/util/Operators.hpp"
 #include "RAJA/pattern/detail/algorithm.hpp"
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/policy/cuda/policy.hpp"
+#include "RAJA/util/Operators.hpp"
+#include "RAJA/util/concepts.hpp"
+#include "cub/device/device_radix_sort.cuh"
 
 namespace RAJA
 {
@@ -44,32 +43,44 @@ namespace sort
 /*!
         \brief static assert unimplemented stable sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-stable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter,
-    Iter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename Iter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+stable(resources::Cuda cuda_res,
+       ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                IterationGetter,
+                                                Concretizer,
+                                                BLOCKS_PER_SM,
+                                                Async>,
+       Iter,
+       Iter,
+       Compare)
 {
-  static_assert (std::is_pointer<Iter>::value,
-      "stable_sort<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<Iter>::value,
+                "stable_sort<cuda_exec> is only implemented for pointers");
   using iterval = RAJA::detail::IterVal<Iter>;
-  static_assert (type_traits::is_arithmetic<iterval>::value,
-      "stable_sort<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<iterval>>,
-      camp::is_same<Compare, operators::greater<iterval>>>::value,
-      "stable_sort<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(type_traits::is_arithmetic<iterval>::value,
+                "stable_sort<cuda_exec> is only implemented for arithmetic "
+                "types");
+  static_assert(concepts::any_of<
+                    camp::is_same<Compare, operators::less<iterval>>,
+                    camp::is_same<Compare, operators::greater<iterval>>>::value,
+                "stable_sort<cuda_exec> is only implemented for "
+                "RAJA::operators::less or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -77,26 +88,32 @@ stable(
 /*!
         \brief stable sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>>)
+stable(resources::Cuda cuda_res,
+       ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                IterationGetter,
+                                                Concretizer,
+                                                BLOCKS_PER_SM,
+                                                Async>,
+       Iter begin,
+       Iter end,
+       operators::less<RAJA::detail::IterVal<Iter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
   int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int begin_bit = 0;
+  int end_bit = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
@@ -134,7 +151,8 @@ stable(
   if (d_keys.Current() == d_out) {
 
     // copy
-    cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(
+        begin, d_out, len * sizeof(R), cudaMemcpyDefault, stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_out);
@@ -147,26 +165,32 @@ stable(
 /*!
         \brief stable sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>>)
+stable(resources::Cuda cuda_res,
+       ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                IterationGetter,
+                                                Concretizer,
+                                                BLOCKS_PER_SM,
+                                                Async>,
+       Iter begin,
+       Iter end,
+       operators::greater<RAJA::detail::IterVal<Iter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
   int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int begin_bit = 0;
+  int end_bit = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
@@ -204,7 +228,8 @@ stable(
   if (d_keys.Current() == d_out) {
 
     // copy
-    cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(
+        begin, d_out, len * sizeof(R), cudaMemcpyDefault, stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_out);
@@ -218,32 +243,43 @@ stable(
 /*!
         \brief static assert unimplemented sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-unstable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter,
-    Iter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename Iter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+unstable(resources::Cuda cuda_res,
+         ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                  IterationGetter,
+                                                  Concretizer,
+                                                  BLOCKS_PER_SM,
+                                                  Async>,
+         Iter,
+         Iter,
+         Compare)
 {
-  static_assert (std::is_pointer<Iter>::value,
-      "sort<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<Iter>::value,
+                "sort<cuda_exec> is only implemented for pointers");
   using iterval = RAJA::detail::IterVal<Iter>;
-  static_assert (type_traits::is_arithmetic<iterval>::value,
-      "sort<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<iterval>>,
-      camp::is_same<Compare, operators::greater<iterval>>>::value,
-      "sort<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(type_traits::is_arithmetic<iterval>::value,
+                "sort<cuda_exec> is only implemented for arithmetic types");
+  static_assert(concepts::any_of<
+                    camp::is_same<Compare, operators::less<iterval>>,
+                    camp::is_same<Compare, operators::greater<iterval>>>::value,
+                "sort<cuda_exec> is only implemented for RAJA::operators::less "
+                "or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -251,18 +287,24 @@ unstable(
 /*!
         \brief sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Cuda cuda_res,
+         ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                  IterationGetter,
+                                                  Concretizer,
+                                                  BLOCKS_PER_SM,
+                                                  Async> p,
+         Iter begin,
+         Iter end,
+         operators::less<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(cuda_res, p, begin, end, comp);
 }
@@ -270,18 +312,24 @@ unstable(
 /*!
         \brief sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Cuda cuda_res,
+         ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                  IterationGetter,
+                                                  Concretizer,
+                                                  BLOCKS_PER_SM,
+                                                  Async> p,
+         Iter begin,
+         Iter end,
+         operators::greater<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(cuda_res, p, begin, end, comp);
 }
@@ -290,36 +338,52 @@ unstable(
 /*!
         \brief static assert unimplemented stable sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
-stable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter,
-    KeyIter,
-    ValIter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+stable_pairs(resources::Cuda cuda_res,
+             ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                      IterationGetter,
+                                                      Concretizer,
+                                                      BLOCKS_PER_SM,
+                                                      Async>,
+             KeyIter,
+             KeyIter,
+             ValIter,
+             Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "stable_sort_pairs<cuda_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "stable_sort_pairs<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "stable_sort_pairs<cuda_exec> is only implemented for "
+                "pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "stable_sort_pairs<cuda_exec> is only implemented for "
+                "pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
-      "stable_sort_pairs<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "stable_sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(type_traits::is_arithmetic<K>::value,
+                "stable_sort_pairs<cuda_exec> is only implemented for "
+                "arithmetic types");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "stable_sort_pairs<cuda_exec> is only implemented for "
+      "RAJA::operators::less or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -327,20 +391,28 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-stable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::less<RAJA::detail::IterVal<KeyIter>>)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+stable_pairs(resources::Cuda cuda_res,
+             ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                      IterationGetter,
+                                                      Concretizer,
+                                                      BLOCKS_PER_SM,
+                                                      Async>,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             operators::less<RAJA::detail::IterVal<KeyIter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
@@ -348,8 +420,8 @@ stable_pairs(
   using V = RAJA::detail::IterVal<ValIter>;
 
   int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int begin_bit = 0;
+  int end_bit = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
@@ -391,12 +463,14 @@ stable_pairs(
   if (d_keys.Current() == d_keys_out) {
 
     // copy keys
-    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(
+        keys_begin, d_keys_out, len * sizeof(K), cudaMemcpyDefault, stream));
   }
   if (d_vals.Current() == d_vals_out) {
 
     // copy vals
-    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(
+        vals_begin, d_vals_out, len * sizeof(V), cudaMemcpyDefault, stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_keys_out);
@@ -410,20 +484,28 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-stable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::greater<RAJA::detail::IterVal<KeyIter>>)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+stable_pairs(resources::Cuda cuda_res,
+             ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                      IterationGetter,
+                                                      Concretizer,
+                                                      BLOCKS_PER_SM,
+                                                      Async>,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             operators::greater<RAJA::detail::IterVal<KeyIter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
@@ -431,8 +513,8 @@ stable_pairs(
   using V = RAJA::detail::IterVal<ValIter>;
 
   int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int begin_bit = 0;
+  int end_bit = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
@@ -474,12 +556,14 @@ stable_pairs(
   if (d_keys.Current() == d_keys_out) {
 
     // copy keys
-    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(
+        keys_begin, d_keys_out, len * sizeof(K), cudaMemcpyDefault, stream));
   }
   if (d_vals.Current() == d_vals_out) {
 
     // copy vals
-    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(
+        vals_begin, d_vals_out, len * sizeof(V), cudaMemcpyDefault, stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_keys_out);
@@ -494,36 +578,50 @@ stable_pairs(
 /*!
         \brief static assert unimplemented sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
-unstable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter,
-    KeyIter,
-    ValIter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+unstable_pairs(resources::Cuda cuda_res,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async>,
+               KeyIter,
+               KeyIter,
+               ValIter,
+               Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "sort_pairs<cuda_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "sort_pairs<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "sort_pairs<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "sort_pairs<cuda_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
-      "sort_pairs<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(type_traits::is_arithmetic<K>::value,
+                "sort_pairs<cuda_exec> is only implemented for arithmetic "
+                "types");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or "
+      "RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -531,20 +629,28 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-unstable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::less<RAJA::detail::IterVal<KeyIter>> comp)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+unstable_pairs(resources::Cuda cuda_res,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async> p,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               operators::less<RAJA::detail::IterVal<KeyIter>> comp)
 {
   return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
 }
@@ -552,20 +658,28 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-unstable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::greater<RAJA::detail::IterVal<KeyIter>> comp)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+unstable_pairs(resources::Cuda cuda_res,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async> p,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               operators::greater<RAJA::detail::IterVal<KeyIter>> comp)
 {
   return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
 }
diff --git a/include/RAJA/policy/desul/atomic.hpp b/include/RAJA/policy/desul/atomic.hpp
index 71bf429079..c65eae2387 100644
--- a/include/RAJA/policy/desul/atomic.hpp
+++ b/include/RAJA/policy/desul/atomic.hpp
@@ -12,10 +12,8 @@
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
 
-#include "RAJA/util/macros.hpp"
-
 #include "RAJA/policy/atomic_builtin.hpp"
-
+#include "RAJA/util/macros.hpp"
 #include "desul/atomics.hpp"
 
 // Default desul options for RAJA
@@ -28,9 +26,7 @@ namespace RAJA
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T
-atomicLoad(AtomicPolicy, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(AtomicPolicy, T *acc)
 {
   return desul::atomic_load(acc,
                             raja_default_desul_order{},
@@ -39,9 +35,7 @@ atomicLoad(AtomicPolicy, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE void
-atomicStore(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE void atomicStore(AtomicPolicy, T *acc, T value)
 {
   desul::atomic_store(acc,
                       value,
@@ -51,9 +45,7 @@ atomicStore(AtomicPolicy, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T
-atomicAdd(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(AtomicPolicy, T *acc, T value)
 {
   return desul::atomic_fetch_add(acc,
                                  value,
@@ -63,9 +55,7 @@ atomicAdd(AtomicPolicy, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T
-atomicSub(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(AtomicPolicy, T *acc, T value)
 {
   return desul::atomic_fetch_sub(acc,
                                  value,
@@ -75,8 +65,7 @@ atomicSub(AtomicPolicy, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMin(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(AtomicPolicy, T *acc, T value)
 {
   return desul::atomic_fetch_min(acc,
                                  value,
@@ -86,8 +75,7 @@ RAJA_INLINE T atomicMin(AtomicPolicy, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMax(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(AtomicPolicy, T *acc, T value)
 {
   return desul::atomic_fetch_max(acc,
                                  value,
@@ -97,8 +85,7 @@ RAJA_INLINE T atomicMax(AtomicPolicy, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(AtomicPolicy, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T *acc)
 {
   return desul::atomic_fetch_inc(acc,
                                  raja_default_desul_order{},
@@ -107,8 +94,7 @@ RAJA_INLINE T atomicInc(AtomicPolicy, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(AtomicPolicy, T *acc, T val)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T *acc, T val)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
@@ -120,8 +106,7 @@ RAJA_INLINE T atomicInc(AtomicPolicy, T *acc, T val)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(AtomicPolicy, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T *acc)
 {
   return desul::atomic_fetch_dec(acc,
                                  raja_default_desul_order{},
@@ -130,8 +115,7 @@ RAJA_INLINE T atomicDec(AtomicPolicy, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(AtomicPolicy, T *acc, T val)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T *acc, T val)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
@@ -143,8 +127,7 @@ RAJA_INLINE T atomicDec(AtomicPolicy, T *acc, T val)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAnd(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(AtomicPolicy, T *acc, T value)
 {
   return desul::atomic_fetch_and(acc,
                                  value,
@@ -154,8 +137,7 @@ RAJA_INLINE T atomicAnd(AtomicPolicy, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicOr(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(AtomicPolicy, T *acc, T value)
 {
   return desul::atomic_fetch_or(acc,
                                 value,
@@ -165,8 +147,7 @@ RAJA_INLINE T atomicOr(AtomicPolicy, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicXor(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(AtomicPolicy, T *acc, T value)
 {
   return desul::atomic_fetch_xor(acc,
                                  value,
@@ -176,8 +157,7 @@ RAJA_INLINE T atomicXor(AtomicPolicy, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(AtomicPolicy, T *acc, T value)
 {
   return desul::atomic_exchange(acc,
                                 value,
@@ -187,8 +167,8 @@ RAJA_INLINE T atomicExchange(AtomicPolicy, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(AtomicPolicy, T *acc, T compare, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T
+atomicCAS(AtomicPolicy, T *acc, T compare, T value)
 {
   return desul::atomic_compare_exchange(acc,
                                         compare,
@@ -200,4 +180,4 @@ RAJA_INLINE T atomicCAS(AtomicPolicy, T *acc, T compare, T value)
 }  // namespace RAJA
 
 #endif  // RAJA_ENABLE_DESUL_ATOMICS
-#endif // guard
+#endif  // guard
diff --git a/include/RAJA/policy/hip.hpp b/include/RAJA/policy/hip.hpp
index ab7e922c0f..b4bd6c99c5 100644
--- a/include/RAJA/policy/hip.hpp
+++ b/include/RAJA/policy/hip.hpp
@@ -30,16 +30,16 @@
 #include "RAJA/policy/hip/atomic.hpp"
 #endif
 
+#include "RAJA/policy/hip/WorkGroup.hpp"
 #include "RAJA/policy/hip/forall.hpp"
+#include "RAJA/policy/hip/kernel.hpp"
+#include "RAJA/policy/hip/launch.hpp"
+#include "RAJA/policy/hip/multi_reduce.hpp"
 #include "RAJA/policy/hip/policy.hpp"
 #include "RAJA/policy/hip/reduce.hpp"
-#include "RAJA/policy/hip/multi_reduce.hpp"
 #include "RAJA/policy/hip/scan.hpp"
 #include "RAJA/policy/hip/sort.hpp"
-#include "RAJA/policy/hip/kernel.hpp"
 #include "RAJA/policy/hip/synchronize.hpp"
-#include "RAJA/policy/hip/launch.hpp"
-#include "RAJA/policy/hip/WorkGroup.hpp"
 
 
 #endif  // closing endif for if defined(RAJA_HIP_ACTIVE)
diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp
index f1f69eab5e..653235af9e 100644
--- a/include/RAJA/policy/hip/MemUtils_HIP.hpp
+++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp
@@ -30,14 +30,13 @@
 #include <type_traits>
 #include <unordered_map>
 
+#include "RAJA/policy/hip/policy.hpp"
+#include "RAJA/policy/hip/raja_hiperrchk.hpp"
 #include "RAJA/util/basic_mempool.hpp"
-#include "RAJA/util/mutex.hpp"
-#include "RAJA/util/types.hpp"
 #include "RAJA/util/macros.hpp"
+#include "RAJA/util/mutex.hpp"
 #include "RAJA/util/resource.hpp"
-
-#include "RAJA/policy/hip/policy.hpp"
-#include "RAJA/policy/hip/raja_hiperrchk.hpp"
+#include "RAJA/util/types.hpp"
 
 #if defined(RAJA_ENABLE_ROCTX)
 #include "hip/hip_runtime_api.h"
@@ -78,8 +77,9 @@ struct PinnedAllocator {
   void* malloc(size_t nbytes)
   {
     void* ptr;
-    hipErrchk(hipHostMalloc(&ptr, nbytes,
-        hipHostMallocMapped | hipHostMallocNonCoherent));
+    hipErrchk(hipHostMalloc(&ptr,
+                            nbytes,
+                            hipHostMallocMapped | hipHostMallocNonCoherent));
     return ptr;
   }
 
@@ -155,7 +155,8 @@ struct DevicePinnedAllocator {
 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
 using device_zeroed_mempool_type =
     basic_mempool::MemPool<DeviceZeroedAllocator>;
-using device_pinned_mempool_type = basic_mempool::MemPool<DevicePinnedAllocator>;
+using device_pinned_mempool_type =
+    basic_mempool::MemPool<DevicePinnedAllocator>;
 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 
 namespace detail
@@ -167,7 +168,7 @@ struct hipInfo {
   hip_dim_t gridDim{0, 0, 0};
   hip_dim_t blockDim{0, 0, 0};
   size_t* dynamic_smem = nullptr;
-  ::RAJA::resources::Hip res{::RAJA::resources::Hip::HipFromStream(0,0)};
+  ::RAJA::resources::Hip res{::RAJA::resources::Hip::HipFromStream(0, 0)};
   bool setup_reducers = false;
 };
 struct hipStatusInfo : hipInfo {
@@ -187,10 +188,7 @@ extern hipStatusInfo tl_status;
 extern std::unordered_map<hipStream_t, bool> g_stream_info_map;
 
 RAJA_INLINE
-void synchronize_impl(::RAJA::resources::Hip res)
-{
-  res.wait();
-}
+void synchronize_impl(::RAJA::resources::Hip res) { res.wait(); }
 
 }  // namespace detail
 
@@ -251,18 +249,25 @@ void launch(::RAJA::resources::Hip res, bool async = true)
 
 //! Launch kernel and indicate resource synchronization status
 RAJA_INLINE
-void launch(const void* func, hip_dim_t gridDim, hip_dim_t blockDim, void** args, size_t shmem,
-            ::RAJA::resources::Hip res, bool async = true, const char *name = nullptr)
+void launch(const void* func,
+            hip_dim_t gridDim,
+            hip_dim_t blockDim,
+            void** args,
+            size_t shmem,
+            ::RAJA::resources::Hip res,
+            bool async = true,
+            const char* name = nullptr)
 {
-  #if defined(RAJA_ENABLE_ROCTX)
-  if(name) roctxRangePush(name);
-  #else
-    RAJA_UNUSED_VAR(name);
-  #endif
-  hipErrchk(hipLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
-  #if defined(RAJA_ENABLE_ROCTX)
-  if(name) roctxRangePop();
-  #endif
+#if defined(RAJA_ENABLE_ROCTX)
+  if (name) roctxRangePush(name);
+#else
+  RAJA_UNUSED_VAR(name);
+#endif
+  hipErrchk(
+      hipLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
+#if defined(RAJA_ENABLE_ROCTX)
+  if (name) roctxRangePop();
+#endif
   launch(res, async);
 }
 
@@ -280,9 +285,11 @@ hip_dim_t currentGridDim() { return detail::tl_status.gridDim; }
 
 //! get grid size of current launch
 RAJA_INLINE
-hip_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x *
-                                            detail::tl_status.gridDim.y *
-                                            detail::tl_status.gridDim.z; }
+hip_dim_member_t currentGridSize()
+{
+  return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
+         detail::tl_status.gridDim.z;
+}
 
 //! get blockDim of current launch
 RAJA_INLINE
@@ -290,9 +297,11 @@ hip_dim_t currentBlockDim() { return detail::tl_status.blockDim; }
 
 //! get block size of current launch
 RAJA_INLINE
-hip_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x *
-                                             detail::tl_status.blockDim.y *
-                                             detail::tl_status.blockDim.z; }
+hip_dim_member_t currentBlockSize()
+{
+  return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
+         detail::tl_status.blockDim.z;
+}
 
 //! get dynamic shared memory usage for current launch
 RAJA_INLINE
@@ -307,7 +316,8 @@ size_t maxDynamicShmem()
   return func_attr.maxDynamicSharedSizeBytes;
 }
 
-constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::max();
+constexpr size_t dynamic_smem_allocation_failure =
+    std::numeric_limits<size_t>::max();
 
 //! Allocate dynamic shared memory for current launch
 //
@@ -319,19 +329,19 @@ constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::
 //  Returns an offset into dynamic shared memory aligned to align on success,
 //  or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory
 //  takes the failure return path.
-template < typename T, typename GetNFromMax >
-RAJA_INLINE
-size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T))
+template <typename T, typename GetNFromMax>
+RAJA_INLINE size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
+                                        size_t align = alignof(T))
 {
   const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
   const size_t align_offset = ((unaligned_shmem % align) != size_t(0))
-      ? align - (unaligned_shmem % align)
-      : size_t(0);
+                                  ? align - (unaligned_shmem % align)
+                                  : size_t(0);
   const size_t aligned_shmem = unaligned_shmem + align_offset;
 
   const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
-  const size_t n_bytes = sizeof(T) *
-      std::forward<GetNFromMax>(get_n_from_max)(max_shmem_bytes / sizeof(T));
+  const size_t n_bytes = sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
+                                         max_shmem_bytes / sizeof(T));
 
   if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) {
     *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
@@ -359,7 +369,8 @@ RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
     ::RAJA::resources::Hip res,
     LOOP_BODY&& loop_body)
 {
-  ::RAJA::detail::ScopedAssignment<detail::hipInfo> info_sa(detail::tl_status,
+  ::RAJA::detail::ScopedAssignment<detail::hipInfo> info_sa(
+      detail::tl_status,
       detail::hipInfo{func, gridDim, blockDim, &dynamic_smem, res, true});
 
   using return_type = typename std::remove_reference<LOOP_BODY>::type;
@@ -372,10 +383,10 @@ static constexpr size_t hip_occupancy_uninitialized_size_t =
     std::numeric_limits<size_t>::max();
 
 //! Struct with the maximum theoretical occupancy of the device
-struct HipFixedMaxBlocksData
-{
+struct HipFixedMaxBlocksData {
   int device_sm_per_device = hip::device_prop().multiProcessorCount;
-  int device_max_threads_per_sm = hip::device_prop().maxThreadsPerMultiProcessor;
+  int device_max_threads_per_sm =
+      hip::device_prop().maxThreadsPerMultiProcessor;
 };
 
 //! Get the maximum theoretical occupancy of the device
@@ -388,18 +399,17 @@ HipFixedMaxBlocksData hip_max_blocks()
 }
 
 //! Struct with the maximum occupancy of a kernel in simple terms
-struct HipOccMaxBlocksThreadsData
-{
+struct HipOccMaxBlocksThreadsData {
   size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
   int func_max_blocks_per_device = hip_occupancy_uninitialized_int;
   int func_max_threads_per_block = hip_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with unknown threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE HipOccMaxBlocksThreadsData
+hip_occupancy_max_blocks_threads(const void* func,
+                                 size_t func_dynamic_shmem_per_block)
 {
   static thread_local HipOccMaxBlocksThreadsData data;
 
@@ -408,33 +418,33 @@ HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func,
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
 
 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
-    hipErrchk(hipOccupancyMaxPotentialBlockSize(
-        &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block));
+    hipErrchk(
+        hipOccupancyMaxPotentialBlockSize(&data.func_max_blocks_per_device,
+                                          &data.func_max_threads_per_block,
+                                          func,
+                                          func_dynamic_shmem_per_block));
 #else
     RAJA_UNUSED_VAR(func);
     hipDeviceProp_t& prop = hip::device_prop();
     data.func_max_blocks_per_device = prop.multiProcessorCount;
     data.func_max_threads_per_block = 1024;
 #endif
-
   }
 
   return data;
 }
 
 //! Struct with the maximum occupancy of a kernel in specific terms
-struct HipOccMaxBlocksData : HipFixedMaxBlocksData
-{
+struct HipOccMaxBlocksData : HipFixedMaxBlocksData {
   size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
   int func_threads_per_block = hip_occupancy_uninitialized_int;
   int func_max_blocks_per_sm = hip_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with compile time threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block >
-RAJA_INLINE
-HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block>
+RAJA_INLINE HipOccMaxBlocksData
+hip_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block)
 {
   static thread_local HipOccMaxBlocksData data;
 
@@ -445,41 +455,52 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
 
 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
     hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-        &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
+        &data.func_max_blocks_per_sm,
+        func,
+        func_threads_per_block,
+        func_dynamic_shmem_per_block));
 #else
     RAJA_UNUSED_VAR(func);
-    data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024;
-    if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 }
+    data.func_max_blocks_per_sm =
+        hip::device_prop().maxThreadsPerMultiProcessor / 1024;
+    if (data.func_max_blocks_per_sm <= 0) {
+      data.func_max_blocks_per_sm = 1;  // report at least one block per SM
+    }
 #endif
-
   }
 
   return data;
 }
 
 //! Get the maximum occupancy of a kernel with runtime threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block, int func_threads_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE HipOccMaxBlocksData
+hip_occupancy_max_blocks(const void* func,
+                         size_t func_dynamic_shmem_per_block,
+                         int func_threads_per_block)
 {
   static thread_local HipOccMaxBlocksData data;
 
-  if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
-       data.func_threads_per_block != func_threads_per_block ) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
+      data.func_threads_per_block != func_threads_per_block) {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
     data.func_threads_per_block = func_threads_per_block;
 
 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
     hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-        &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
+        &data.func_max_blocks_per_sm,
+        func,
+        func_threads_per_block,
+        func_dynamic_shmem_per_block));
 #else
     RAJA_UNUSED_VAR(func);
-    data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024;
-    if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 }
+    data.func_max_blocks_per_sm =
+        hip::device_prop().maxThreadsPerMultiProcessor / 1024;
+    if (data.func_max_blocks_per_sm <= 0) {
+      data.func_max_blocks_per_sm = 1;  // report at least one block per SM
+    }
 #endif
-
   }
 
   return data;
@@ -512,14 +533,16 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
  *
  ******************************************************************************
  */
-template < typename IdxT, typename Concretizer, typename UniqueMarker>
-struct ConcretizerImpl
-{
-  ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len)
-    : m_func(func)
-    , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block)
-    , m_len(len)
-  { }
+template <typename IdxT, typename Concretizer, typename UniqueMarker>
+struct ConcretizerImpl {
+  ConcretizerImpl(const void* func,
+                  size_t func_dynamic_shmem_per_block,
+                  IdxT len)
+      : m_func(func),
+        m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
+        m_len(len)
+  {
+  }
 
   IdxT get_max_block_size() const
   {
@@ -533,7 +556,8 @@ struct ConcretizerImpl
   IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
     if (func_threads_per_block <= func_max_threads_per_block) {
       return func_threads_per_block;
     } else {
@@ -544,7 +568,8 @@ struct ConcretizerImpl
   //! Get a grid size when block size is specified
   IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const
   {
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return func_blocks_per_device;
   }
 
@@ -552,26 +577,31 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_len() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
   //! Get a block size when grid size is specified
   IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
     return std::min(func_threads_per_block, func_max_threads_per_block);
   }
 
   //! Get a grid size when block size is specified
   IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const
   {
-    auto data = hip_occupancy_max_blocks<UniqueMarker>(
-        m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
-    IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size<IdxT>(data);
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    auto data =
+        hip_occupancy_max_blocks<UniqueMarker>(m_func,
+                                               m_func_dynamic_shmem_per_block,
+                                               func_threads_per_block);
+    IdxT func_max_blocks_per_device =
+        Concretizer::template get_max_grid_size<IdxT>(data);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return std::min(func_blocks_per_device, func_max_blocks_per_device);
   }
 
@@ -579,9 +609,9 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_device() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        this->get_grid_size_to_fit_device(func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
 private:
diff --git a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
index 975d26b7ff..6efd7b51ac 100644
--- a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
@@ -18,16 +18,13 @@
 #ifndef RAJA_hip_WorkGroup_Dispatcher_HPP
 #define RAJA_hip_WorkGroup_Dispatcher_HPP
 
-#include "RAJA/config.hpp"
-
-#include "camp/resource.hpp"
-
-#include "RAJA/policy/hip/policy.hpp"
+#include <mutex>
+#include <thread>
 
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
-
-#include <thread>
-#include <mutex>
+#include "RAJA/policy/hip/policy.hpp"
+#include "camp/resource.hpp"
 
 
 namespace RAJA
@@ -41,9 +38,9 @@ namespace hip
 
 // global function that creates the value on the device using the
 // factory and writes it into a pinned ptr
-template < typename Factory >
-__global__ void get_value_global(
-    typename Factory::value_type* ptr, Factory factory)
+template <typename Factory>
+__global__ void get_value_global(typename Factory::value_type* ptr,
+                                 Factory factory)
 {
   *ptr = factory();
 }
@@ -73,7 +70,7 @@ inline std::mutex& get_value_mutex()
 // get the device function pointer by calling a global function to
 // write it into a pinned ptr, beware different instantiates of this
 // function may run concurrently
-template < typename Factory >
+template <typename Factory>
 inline auto get_value(Factory&& factory)
 {
   using value_type = typename std::decay_t<Factory>::value_type;
@@ -81,8 +78,9 @@ inline auto get_value(Factory&& factory)
 
   auto res = ::camp::resources::Hip::get_default();
   auto ptr = static_cast<value_type*>(get_cached_value_ptr(sizeof(value_type)));
-  auto func = reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
-  void *args[] = {(void*)&ptr, (void*)&factory};
+  auto func =
+      reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
+  void* args[] = {(void*)&ptr, (void*)&factory};
   hipErrchk(hipLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
   hipErrchk(hipStreamSynchronize(res.get_stream()));
 
@@ -91,7 +89,7 @@ inline auto get_value(Factory&& factory)
 
 // get the device function pointer and store it so it can be used
 // multiple times
-template < typename Factory >
+template <typename Factory>
 inline auto get_cached_value(Factory&& factory)
 {
   static auto value = get_value(std::forward<Factory>(factory));
@@ -101,17 +99,15 @@ inline auto get_cached_value(Factory&& factory)
 }  // namespace hip
 
 /*!
-* Populate and return a Dispatcher object that can be used in device code
-*/
-template < typename T, typename Dispatcher_T, size_t BLOCK_SIZE, bool Async >
+ * Populate and return a Dispatcher object that can be used in device code
+ */
+template <typename T, typename Dispatcher_T, size_t BLOCK_SIZE, bool Async>
 inline const Dispatcher_T* get_Dispatcher(hip_work<BLOCK_SIZE, Async> const&)
 {
   static Dispatcher_T dispatcher{
-        Dispatcher_T::template makeDispatcher<T>(
-          [](auto&& factory) {
-            return hip::get_cached_value(
-                std::forward<decltype(factory)>(factory));
-          }) };
+      Dispatcher_T::template makeDispatcher<T>([](auto&& factory) {
+        return hip::get_cached_value(std::forward<decltype(factory)>(factory));
+      })};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp
index 26d45d7bd9..91d57f6d3b 100644
--- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp
@@ -19,11 +19,9 @@
 #define RAJA_hip_WorkGroup_WorkRunner_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/policy/hip/policy.hpp"
-#include "RAJA/policy/hip/MemUtils_HIP.hpp"
-
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
+#include "RAJA/policy/hip/MemUtils_HIP.hpp"
+#include "RAJA/policy/hip/policy.hpp"
 
 
 namespace RAJA
@@ -36,35 +34,32 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{
-  using base = WorkRunnerForallOrdered<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+          typename... Args>
+struct WorkRunner<RAJA::hip_work<BLOCK_SIZE, Async>,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallOrdered<RAJA::hip_exec_async<BLOCK_SIZE>,
+                              RAJA::hip_work<BLOCK_SIZE, Async>,
+                              RAJA::ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...> {
+  using base = WorkRunnerForallOrdered<RAJA::hip_exec_async<BLOCK_SIZE>,
+                                       RAJA::hip_work<BLOCK_SIZE, Async>,
+                                       RAJA::ordered,
+                                       DISPATCH_POLICY_T,
+                                       ALLOCATOR_T,
+                                       INDEX_T,
+                                       Args...>;
   using base::base;
   using IndexType = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
@@ -73,9 +68,10 @@ struct WorkRunner<
   /// run the loops in the given work container in order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -84,7 +80,9 @@ struct WorkRunner<
 
     // Only synchronize if we had something to iterate over
     if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::hip::synchronize(r); }
+      if (!Async) {
+        RAJA::hip::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -95,35 +93,32 @@ struct WorkRunner<
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{
-  using base = WorkRunnerForallReverse<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+          typename... Args>
+struct WorkRunner<RAJA::hip_work<BLOCK_SIZE, Async>,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallReverse<RAJA::hip_exec_async<BLOCK_SIZE>,
+                              RAJA::hip_work<BLOCK_SIZE, Async>,
+                              RAJA::reverse_ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...> {
+  using base = WorkRunnerForallReverse<RAJA::hip_exec_async<BLOCK_SIZE>,
+                                       RAJA::hip_work<BLOCK_SIZE, Async>,
+                                       RAJA::reverse_ordered,
+                                       DISPATCH_POLICY_T,
+                                       ALLOCATOR_T,
+                                       INDEX_T,
+                                       Args...>;
   using base::base;
   using IndexType = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
@@ -132,9 +127,10 @@ struct WorkRunner<
   /// run the loops in the given work container in reverse order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -143,7 +139,9 @@ struct WorkRunner<
 
     // Only synchronize if we had something to iterate over
     if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::hip::synchronize(r); }
+      if (!Async) {
+        RAJA::hip::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -155,26 +153,28 @@ struct WorkRunner<
  * A body and segment holder for storing loops that will be executed
  * on the device
  */
-template <typename Segment_type, typename LoopBody,
-          typename index_type, typename ... Args>
-struct HoldHipDeviceXThreadblockLoop
-{
-  template < typename segment_in, typename body_in >
+template <typename Segment_type,
+          typename LoopBody,
+          typename index_type,
+          typename... Args>
+struct HoldHipDeviceXThreadblockLoop {
+  template <typename segment_in, typename body_in>
   HoldHipDeviceXThreadblockLoop(segment_in&& segment, body_in&& body)
-    : m_segment(std::forward<segment_in>(segment))
-    , m_body(std::forward<body_in>(body))
-  { }
+      : m_segment(std::forward<segment_in>(segment)),
+        m_body(std::forward<body_in>(body))
+  {
+  }
 
   RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
   {
     // TODO:: decide when to run hooks, may bypass this and use impl directly
     // TODO:: decide whether or not to privatize the loop body
     const index_type i_begin = threadIdx.x + blockIdx.x * blockDim.x;
-    const index_type stride  = blockDim.x * gridDim.x;
+    const index_type stride = blockDim.x * gridDim.x;
     const auto begin = m_segment.begin();
-    const auto end   = m_segment.end();
+    const auto end = m_segment.end();
     const index_type len(end - begin);
-    for ( index_type i = i_begin; i < len; i += stride ) {
+    for (index_type i = i_begin; i < len; i += stride) {
       m_body(begin[i], std::forward<Args>(args)...);
     }
   }
@@ -184,11 +184,11 @@ struct HoldHipDeviceXThreadblockLoop
   LoopBody m_body;
 };
 
-template < size_t BLOCK_SIZE,
-           typename StorageIter,
-           typename value_type,
-           typename index_type,
-           typename ... Args >
+template <size_t BLOCK_SIZE,
+          typename StorageIter,
+          typename value_type,
+          typename index_type,
+          typename... Args>
 __launch_bounds__(BLOCK_SIZE, 1) __global__
     void hip_unordered_y_block_global(StorageIter iter, Args... args)
 {
@@ -205,21 +205,22 @@ __launch_bounds__(BLOCK_SIZE, 1) __global__
  * the x direction, with the number of threads in the x dimension determined
  * by the average number of iterates per loop
  */
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    DISPATCH_POLICY_T,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...> {
   using exec_policy = RAJA::hip_work<BLOCK_SIZE, Async>;
-  using order_policy = RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;
+  using order_policy =
+      RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;
   using dispatch_policy = DISPATCH_POLICY_T;
   using Allocator = ALLOCATOR_T;
   using index_type = INDEX_T;
@@ -227,14 +228,15 @@ struct WorkRunner<
 
   // The type that will hold the segment and loop body in work storage
   struct holder_type {
-    template < typename T >
+    template <typename T>
     using type = HoldHipDeviceXThreadblockLoop<
-        typename camp::at<T, camp::num<0>>::type, // ITERABLE
-        typename camp::at<T, camp::num<1>>::type, // LOOP_BODY
-        index_type, Args...>;
+        typename camp::at<T, camp::num<0>>::type,  // ITERABLE
+        typename camp::at<T, camp::num<1>>::type,  // LOOP_BODY
+        index_type,
+        Args...>;
   };
   ///
-  template < typename T >
+  template <typename T>
   using holder_type_t = typename holder_type::template type<T>;
 
   // The policy indicating where the call function is invoked
@@ -243,21 +245,24 @@ struct WorkRunner<
 
   // The Dispatcher policy with holder_types used internally to handle the
   // ranges and callables passed in by the user.
-  using dispatcher_holder_policy = dispatcher_transform_types_t<dispatch_policy, holder_type>;
+  using dispatcher_holder_policy =
+      dispatcher_transform_types_t<dispatch_policy, holder_type>;
 
-  using dispatcher_type = Dispatcher<Platform::hip, dispatcher_holder_policy, RAJA::hip_work<BLOCK_SIZE, true>, Args...>;
+  using dispatcher_type = Dispatcher<Platform::hip,
+                                     dispatcher_holder_policy,
+                                     RAJA::hip_work<BLOCK_SIZE, true>,
+                                     Args...>;
 
   WorkRunner() = default;
 
   WorkRunner(WorkRunner const&) = delete;
   WorkRunner& operator=(WorkRunner const&) = delete;
 
-  WorkRunner(WorkRunner && o)
-    : m_total_iterations(o.m_total_iterations)
+  WorkRunner(WorkRunner&& o) : m_total_iterations(o.m_total_iterations)
   {
     o.m_total_iterations = 0;
   }
-  WorkRunner& operator=(WorkRunner && o)
+  WorkRunner& operator=(WorkRunner&& o)
   {
     m_total_iterations = o.m_total_iterations;
 
@@ -267,17 +272,21 @@ struct WorkRunner<
 
   // runner interfaces with storage to enqueue so the runner can get
   // information from the segment and loop at enqueue time
-  template < typename WorkContainer, typename Iterable, typename LoopBody >
-  inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body)
+  template <typename WorkContainer, typename Iterable, typename LoopBody>
+  inline void enqueue(WorkContainer& storage,
+                      Iterable&& iter,
+                      LoopBody&& loop_body)
   {
-    using Iterator  = camp::decay<decltype(std::begin(iter))>;
+    using Iterator = camp::decay<decltype(std::begin(iter))>;
     using LOOP_BODY = camp::decay<LoopBody>;
-    using ITERABLE  = camp::decay<Iterable>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+    using ITERABLE = camp::decay<Iterable>;
+    using IndexType =
+        camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
     using holder = holder_type_t<camp::list<ITERABLE, LOOP_BODY>>;
 
-    // using true_value_type = typename WorkContainer::template true_value_type<holder>;
+    // using true_value_type = typename WorkContainer::template
+    // true_value_type<holder>;
 
     Iterator begin = std::begin(iter);
     Iterator end = std::end(iter);
@@ -289,30 +298,40 @@ struct WorkRunner<
       m_total_iterations += len;
 
       //
-      // TODO: Privatize the loop_body, using make_launch_body to setup reductions
+      // TODO: Privatize the loop_body, using make_launch_body to set up
+      // reductions
       //
       // LOOP_BODY body = RAJA::hip::make_launch_body(func,
-      //     gridSize, blockSize, shmem, stream, std::forward<LoopBody>(loop_body));
+      //     gridSize, blockSize, shmem, stream,
+      //     std::forward<LoopBody>(loop_body));
 
-      storage.template emplace<holder>(
-          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy{}),
-          std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
+      storage.template emplace<holder>(get_Dispatcher<holder, dispatcher_type>(
+                                           dispatcher_exec_policy{}),
+                                       std::forward<Iterable>(iter),
+                                       std::forward<LoopBody>(loop_body));
     }
   }
 
   // no extra storage required here
   using per_run_storage = int;
 
-  template < typename WorkContainer >
-  per_run_storage run(WorkContainer const& storage, resource_type r, Args... args) const
+  template <typename WorkContainer>
+  per_run_storage run(WorkContainer const& storage,
+                      resource_type r,
+                      Args... args) const
   {
-    using Iterator  = camp::decay<decltype(std::begin(storage))>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(storage), std::end(storage)))>;
+    using Iterator = camp::decay<decltype(std::begin(storage))>;
+    using IndexType = camp::decay<decltype(std::distance(std::begin(storage),
+                                                         std::end(storage)))>;
     using value_type = typename WorkContainer::value_type;
 
     per_run_storage run_storage{};
 
-    auto func = hip_unordered_y_block_global<BLOCK_SIZE, Iterator, value_type, index_type, Args...>;
+    auto func = hip_unordered_y_block_global<BLOCK_SIZE,
+                                             Iterator,
+                                             value_type,
+                                             index_type,
+                                             Args...>;
 
     //
     // Compute the requested iteration space size
@@ -324,16 +343,19 @@ struct WorkRunner<
     // Only launch kernel if we have something to iterate over
     if (num_loops > 0 && BLOCK_SIZE > 0) {
 
-      index_type average_iterations = m_total_iterations / static_cast<index_type>(num_loops);
+      index_type average_iterations =
+          m_total_iterations / static_cast<index_type>(num_loops);
 
       //
       // Compute the number of blocks
       //
       constexpr index_type block_size = static_cast<index_type>(BLOCK_SIZE);
       hip_dim_t blockSize{static_cast<hip_dim_member_t>(block_size), 1, 1};
-      hip_dim_t gridSize{static_cast<hip_dim_member_t>((average_iterations + block_size - 1) / block_size),
-                          static_cast<hip_dim_member_t>(num_loops),
-                          1};
+      hip_dim_t gridSize{static_cast<hip_dim_member_t>(
+                             (average_iterations + block_size - 1) /
+                             block_size),
+                         static_cast<hip_dim_member_t>(num_loops),
+                         1};
 
       RAJA_FT_BEGIN;
 
@@ -346,8 +368,9 @@ struct WorkRunner<
         //
         // Launch the kernel
         //
-        void* func_args[] = { (void*)&begin, (void*)&args... };
-        RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async);
+        void* func_args[] = {(void*)&begin, (void*)&args...};
+        RAJA::hip::launch(
+            (const void*)func, gridSize, blockSize, func_args, shmem, r, Async);
       }
 
       RAJA_FT_END;
@@ -357,10 +380,7 @@ struct WorkRunner<
   }
 
   // clear any state so ready to be destroyed or reused
-  void clear()
-  {
-    m_total_iterations = 0;
-  }
+  void clear() { m_total_iterations = 0; }
 
 private:
   index_type m_total_iterations = 0;
@@ -369,29 +389,31 @@ struct WorkRunner<
 #if !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported runner types incomplete
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
-        RAJA::indirect_function_call_dispatch,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    RAJA::indirect_function_call_dispatch,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>;
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
-        RAJA::indirect_virtual_function_dispatch,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    RAJA::indirect_virtual_function_dispatch,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>;
 
 #endif
 
diff --git a/include/RAJA/policy/hip/atomic.hpp b/include/RAJA/policy/hip/atomic.hpp
index b4f0d7faa7..e6fe42c63e 100644
--- a/include/RAJA/policy/hip/atomic.hpp
+++ b/include/RAJA/policy/hip/atomic.hpp
@@ -25,12 +25,11 @@
 #include <cstdint>
 #include <stdexcept>
 #include <type_traits>
-#include "hip/hip_runtime.h"
-
-#include "camp/list.hpp"
 
-#include "RAJA/policy/sequential/atomic.hpp"
 #include "RAJA/policy/atomic_builtin.hpp"
+#include "RAJA/policy/sequential/atomic.hpp"
+#include "camp/list.hpp"
+#include "hip/hip_runtime.h"
 #if defined(RAJA_ENABLE_OPENMP)
 #include "RAJA/policy/openmp/atomic.hpp"
 #endif
@@ -49,11 +48,8 @@ namespace RAJA
 namespace detail
 {
 
-using hip_atomicCommon_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long
->;
+using hip_atomicCommon_builtin_types =
+    ::camp::list<int, unsigned int, unsigned long long>;
 
 /*!
  * Type trait for determining if atomic operators should be implemented
@@ -63,10 +59,9 @@ using hip_atomicCommon_builtin_types = ::camp::list<
  */
 template <typename T>
 struct hip_useBuiltinCommon {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value;
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value;
 };
 
 
@@ -79,14 +74,13 @@ struct hip_useBuiltinCommon {
  */
 template <typename T>
 struct hip_useReinterpretCommon {
-  static constexpr bool value =
-    !hip_useBuiltinCommon<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+  static constexpr bool value = !hip_useBuiltinCommon<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 
@@ -118,11 +112,10 @@ RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
  */
 template <typename T>
 struct hip_useBuiltinExchange {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value ||
-    std::is_same<T, float>::value;
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value ||
+                                std::is_same<T, float>::value;
 };
 
 /*!
@@ -131,21 +124,21 @@ struct hip_useBuiltinExchange {
  */
 template <typename T>
 struct hip_useReinterpretExchange {
-  static constexpr bool value =
-    !hip_useBuiltinExchange<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+  static constexpr bool value = !hip_useBuiltinExchange<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
 template <typename T>
-using hip_useReinterpretExchange_t = typename hip_useReinterpretExchange<T>::type;
+using hip_useReinterpretExchange_t =
+    typename hip_useReinterpretExchange<T>::type;
 
 /*!
  * Performs an atomic exchange using a builtin function. Stores the new value
@@ -169,8 +162,8 @@ RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value)
   using R = hip_useReinterpretExchange_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    hip_atomicExchange(reinterpret_cast<R*>(acc),
-                       RAJA::util::reinterp_A_as_B<T, R>(value)));
+      hip_atomicExchange(reinterpret_cast<R *>(acc),
+                         RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
@@ -184,8 +177,8 @@ RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value)
 template <typename T>
 struct hip_useBuiltinLoad {
   static constexpr bool value =
-    (std::is_integral<T>::value || std::is_enum<T>::value) &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      (std::is_integral<T>::value || std::is_enum<T>::value) &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
 template <typename T>
@@ -198,53 +191,52 @@ using hip_useBuiltinStore = hip_useBuiltinLoad<T>;
  */
 template <typename T>
 struct hip_useReinterpretLoad {
-  static constexpr bool value =
-    !std::is_integral<T>::value &&
-    !std::is_enum<T>::value &&
-    ((sizeof(T) == 1
+  static constexpr bool value = !std::is_integral<T>::value &&
+                                !std::is_enum<T>::value &&
+                                ((sizeof(T) == 1
 #if !defined(UINT8_MAX)
-      && sizeof(unsigned char) == 1
+                                  && sizeof(unsigned char) == 1
 #endif
-     ) ||
-     (sizeof(T) == 2
+                                  ) ||
+                                 (sizeof(T) == 2
 #if !defined(UINT16_MAX)
-      && sizeof(unsigned short) == 2
+                                  && sizeof(unsigned short) == 2
 #endif
-     ) ||
-     (sizeof(T) == 4
+                                  ) ||
+                                 (sizeof(T) == 4
 #if !defined(UINT32_MAX)
-      && sizeof(unsigned int) == 4
+                                  && sizeof(unsigned int) == 4
 #endif
-     ) ||
-     (sizeof(T) == 8
+                                  ) ||
+                                 (sizeof(T) == 8
 #if !defined(UINT64_MAX)
-      && sizeof(unsigned long long) == 8
+                                  && sizeof(unsigned long long) == 8
 #endif
-     ));
+                                  ));
 
   using type =
-    std::conditional_t<sizeof(T) == 1,
+      std::conditional_t<sizeof(T) == 1,
 #if defined(UINT8_MAX)
-                       uint8_t,
+                         uint8_t,
 #else
-                       unsigned char,
+                         unsigned char,
 #endif
-    std::conditional_t<sizeof(T) == 2,
+                         std::conditional_t<sizeof(T) == 2,
 #if defined(UINT16_MAX)
-                       uint16_t,
+                                            uint16_t,
 #else
-                       unsigned short,
+                                            unsigned short,
 #endif
-    std::conditional_t<sizeof(T) == 4,
+                                            std::conditional_t<sizeof(T) == 4,
 #if defined(UINT32_MAX)
-                       uint32_t,
+                                                               uint32_t,
 #else
-                       unsigned int,
+                                                               unsigned int,
 #endif
 #if defined(UINT64_MAX)
-                       uint64_t>>>;
+                                                               uint64_t>>>;
 #else
-                       unsigned long long>>>;
+                                                               unsigned long long>>>;
 #endif
 };
 
@@ -301,7 +293,7 @@ RAJA_INLINE __device__ T hip_atomicLoad(T *acc)
   using R = hip_useReinterpretLoad_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    hip_atomicLoad(reinterpret_cast<R*>(acc)));
+      hip_atomicLoad(reinterpret_cast<R *>(acc)));
 }
 
 
@@ -325,7 +317,7 @@ RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value)
 {
   using R = hip_useReinterpretStore_t<T>;
 
-  hip_atomicStore(reinterpret_cast<R*>(acc),
+  hip_atomicStore(reinterpret_cast<R *>(acc),
                   RAJA::util::reinterp_A_as_B<T, R>(value));
 }
 
@@ -354,9 +346,9 @@ RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value)
   using R = hip_useReinterpretCommon_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    hip_atomicCAS(reinterpret_cast<R*>(acc),
-                  RAJA::util::reinterp_A_as_B<T, R>(compare),
-                  RAJA::util::reinterp_A_as_B<T, R>(value)));
+      hip_atomicCAS(reinterpret_cast<R *>(acc),
+                    RAJA::util::reinterp_A_as_B<T, R>(compare),
+                    RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
@@ -367,14 +359,14 @@ RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value)
  */
 template <typename T,
           std::enable_if_t<hip_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b)
+RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T &a, const T &b)
 {
   return a == b;
 }
 
 template <typename T,
           std::enable_if_t<hip_useReinterpretCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b)
+RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T &a, const T &b)
 {
   using R = hip_useReinterpretCommon_t<T>;
 
@@ -390,8 +382,7 @@ RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b)
  * operation.
  */
 template <typename T, typename Oper>
-RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
-                                            Oper&& oper)
+RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc, Oper &&oper)
 {
   T old = hip_atomicLoad(acc);
   T expected;
@@ -406,15 +397,15 @@ RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
 
 
 /*!
- * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting.
- * Implementation uses the existing HIP supplied unsigned 32-bit or 64-bit CAS
- * operator. Returns the OLD value that was replaced by the result of this
- * operation.
+ * Generic implementation of any atomic 32-bit or 64-bit operator with
+ * short-circuiting. Implementation uses the existing HIP supplied unsigned
+ * 32-bit or 64-bit CAS operator. Returns the OLD value that was replaced by the
+ * result of this operation.
  */
 template <typename T, typename Oper, typename ShortCircuit>
 RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
-                                            Oper&& oper,
-                                            ShortCircuit&& sc)
+                                            Oper &&oper,
+                                            ShortCircuit &&sc)
 {
   T old = hip_atomicLoad(acc);
 
@@ -440,28 +431,27 @@ RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
 /*!
  * List of types where HIP builtin atomics are used to implement atomicAdd.
  */
-using hip_atomicAdd_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long,
-  float
+using hip_atomicAdd_builtin_types = ::camp::list<int,
+                                                 unsigned int,
+                                                 unsigned long long,
+                                                 float
 #ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD
-  ,
-  double
+                                                 ,
+                                                 double
 #endif
->;
+                                                 >;
 
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicAdd_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, hip_atomicAdd_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old + value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old + value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicAdd_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicAdd_builtin_types> * = nullptr>
 RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value)
 {
   return ::atomicAdd(acc, value);
@@ -475,16 +465,15 @@ RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value)
 /*!
  * List of types where HIP builtin atomics are used to implement atomicSub.
  */
-using hip_atomicSub_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long,
-  float
+using hip_atomicSub_builtin_types = ::camp::list<int,
+                                                 unsigned int,
+                                                 unsigned long long,
+                                                 float
 #ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD
-  ,
-  double
+                                                 ,
+                                                 double
 #endif
->;
+                                                 >;
 
 /*!
  * List of types where HIP builtin atomicSub is used to implement atomicSub.
@@ -492,10 +481,7 @@ using hip_atomicSub_builtin_types = ::camp::list<
  * Avoid multiple definition errors by including the previous list type here
  * to ensure these lists have different types.
  */
-using hip_atomicSub_via_Sub_builtin_types = ::camp::list<
-  int,
-  unsigned int
->;
+using hip_atomicSub_via_Sub_builtin_types = ::camp::list<int, unsigned int>;
 
 /*!
  * List of types where HIP builtin atomicAdd is used to implement atomicSub.
@@ -503,32 +489,32 @@ using hip_atomicSub_via_Sub_builtin_types = ::camp::list<
  * Avoid multiple definition errors by including the previous list type here
  * to ensure these lists have different types.
  */
-using hip_atomicSub_via_Add_builtin_types = ::camp::list<
-  unsigned long long,
-  float
+using hip_atomicSub_via_Add_builtin_types = ::camp::list<unsigned long long,
+                                                         float
 #ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD
-  ,
-  double
+                                                         ,
+                                                         double
 #endif
->;
+                                                         >;
 
 /*!
  * HIP atomicSub compare and swap loop implementation.
  */
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicSub_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, hip_atomicSub_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old - value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old - value; });
 }
 
 /*!
  * HIP atomicSub builtin implementation.
  */
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Sub_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Sub_builtin_types> * =
+        nullptr>
 RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
 {
   return ::atomicSub(acc, value);
@@ -537,8 +523,10 @@ RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
 /*!
  * HIP atomicSub via atomicAdd builtin implementation.
  */
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Add_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Add_builtin_types> * =
+        nullptr>
 RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
 {
   return ::atomicAdd(acc, -value);
@@ -551,21 +539,19 @@ RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
 using hip_atomicMin_builtin_types = hip_atomicCommon_builtin_types;
 
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicMin_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, hip_atomicMin_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value)
 {
   return hip_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return value < old ? value : old;
-    },
-    [value] (T current) {
-      return current <= value;
-    });
+      acc,
+      [value](T old) { return value < old ? value : old; },
+      [value](T current) { return current <= value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicMin_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicMin_builtin_types> * = nullptr>
 RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value)
 {
   return ::atomicMin(acc, value);
@@ -578,21 +564,19 @@ RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value)
 using hip_atomicMax_builtin_types = hip_atomicCommon_builtin_types;
 
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicMax_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, hip_atomicMax_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value)
 {
   return hip_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return old < value ? value : old;
-    },
-    [value] (T current) {
-      return value <= current;
-    });
+      acc,
+      [value](T old) { return old < value ? value : old; },
+      [value](T current) { return value <= current; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicMax_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicMax_builtin_types> * = nullptr>
 RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value)
 {
   return ::atomicMax(acc, value);
@@ -605,7 +589,7 @@ RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value)
 template <typename T>
 RAJA_INLINE __device__ T hip_atomicInc(T *acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
+  return hip_atomicCAS_loop(acc, [value](T old) {
     return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
   });
 }
@@ -627,8 +611,9 @@ RAJA_INLINE __device__ T hip_atomicInc(T *acc)
 template <typename T>
 RAJA_INLINE __device__ T hip_atomicDec(T *acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old == static_cast<T>(0) || value < old ? value : old - static_cast<T>(1);
+  return hip_atomicCAS_loop(acc, [value](T old) {
+    return old == static_cast<T>(0) || value < old ? value
+                                                   : old - static_cast<T>(1);
   });
 }
 
@@ -649,16 +634,16 @@ RAJA_INLINE __device__ T hip_atomicDec(T *acc)
 using hip_atomicAnd_builtin_types = hip_atomicCommon_builtin_types;
 
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicAnd_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, hip_atomicAnd_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old & value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old & value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicAnd_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicAnd_builtin_types> * = nullptr>
 RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value)
 {
   return ::atomicAnd(acc, value);
@@ -670,13 +655,12 @@ RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value)
  */
 using hip_atomicOr_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicOr_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicOr_builtin_types> * = nullptr>
 RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old | value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old | value; });
 }
 
 /*!
@@ -691,16 +675,16 @@ RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
 using hip_atomicXor_builtin_types = hip_atomicCommon_builtin_types;
 
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicXor_builtin_types>* = nullptr>
+          RAJA::util::enable_if_is_none_of<T, hip_atomicXor_builtin_types> * =
+              nullptr>
 RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old ^ value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old ^ value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicXor_builtin_types>* = nullptr>
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicXor_builtin_types> * = nullptr>
 RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
 {
   return ::atomicXor(acc, value);
@@ -721,8 +705,8 @@ RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicLoad(hip_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(hip_atomic_explicit<host_policy>,
+                                          T *acc)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicLoad(acc);
@@ -733,8 +717,9 @@ atomicLoad(hip_atomic_explicit<host_policy>, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE void
-atomicStore(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(hip_atomic_explicit<host_policy>,
+                                              T *acc,
+                                              T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   detail::hip_atomicStore(acc, value);
@@ -745,8 +730,9 @@ atomicStore(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAdd(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(hip_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicAdd(acc, value);
@@ -757,8 +743,9 @@ atomicAdd(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicSub(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(hip_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicSub(acc, value);
@@ -769,8 +756,9 @@ atomicSub(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMin(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(hip_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicMin(acc, value);
@@ -781,8 +769,9 @@ atomicMin(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMax(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(hip_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicMax(acc, value);
@@ -793,8 +782,9 @@ atomicMax(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(hip_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicInc(acc, value);
@@ -805,8 +795,8 @@ atomicInc(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(hip_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(hip_atomic_explicit<host_policy>,
+                                         T *acc)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicInc(acc);
@@ -817,8 +807,9 @@ atomicInc(hip_atomic_explicit<host_policy>, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(hip_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicDec(acc, value);
@@ -829,8 +820,8 @@ atomicDec(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(hip_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(hip_atomic_explicit<host_policy>,
+                                         T *acc)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicDec(acc);
@@ -841,8 +832,9 @@ atomicDec(hip_atomic_explicit<host_policy>, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAnd(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(hip_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicAnd(acc, value);
@@ -853,8 +845,9 @@ atomicAnd(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicOr(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(hip_atomic_explicit<host_policy>,
+                                        T *acc,
+                                        T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicOr(acc, value);
@@ -865,8 +858,9 @@ atomicOr(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicXor(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(hip_atomic_explicit<host_policy>,
+                                         T *acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicXor(acc, value);
@@ -877,8 +871,9 @@ atomicXor(hip_atomic_explicit<host_policy>, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicExchange(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(hip_atomic_explicit<host_policy>,
+                                              T *acc,
+                                              T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicExchange(acc, value);
diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp
index a8c4cf53b9..671d1bae3f 100644
--- a/include/RAJA/policy/hip/forall.hpp
+++ b/include/RAJA/policy/hip/forall.hpp
@@ -27,24 +27,18 @@
 #if defined(RAJA_ENABLE_HIP)
 
 #include <algorithm>
-#include "hip/hip_runtime.h"
 
+#include "RAJA/index/IndexSet.hpp"
+#include "RAJA/internal/fault_tolerance.hpp"
 #include "RAJA/pattern/forall.hpp"
-
 #include "RAJA/pattern/params/forall.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/internal/fault_tolerance.hpp"
-
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #include "RAJA/policy/hip/policy.hpp"
 #include "RAJA/policy/hip/raja_hiperrchk.hpp"
-
-#include "RAJA/index/IndexSet.hpp"
-
+#include "RAJA/util/macros.hpp"
 #include "RAJA/util/resource.hpp"
+#include "RAJA/util/types.hpp"
+#include "hip/hip_runtime.h"
 
 namespace RAJA
 {
@@ -71,61 +65,87 @@ namespace impl
  *
  ******************************************************************************
  */
-template<typename IterationMapping, typename IterationGetter, typename Concretizer, typename UniqueMarker>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          typename UniqueMarker>
 struct ForallDimensionCalculator;
 
 // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0
 // there are specializations for named_usage::unspecified
 // but named_usage::ignored is not supported so no specializations are provided
 // and static_asserts in the general case catch unsupported values
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0 or named_usage::unspecified with "
+                "forall");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0 or named_usage::unspecified with "
+                "forall");
 
   using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
     const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
 
-    if ( len > (block_size * grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (block_size * grid_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
-    internal::set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexGetter::block_size));
-    internal::set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexGetter::grid_size));
+    internal::set_hip_dim<dim>(dims.threads,
+                               static_cast<IdxT>(IndexGetter::block_size));
+    internal::set_hip_dim<dim>(dims.blocks,
+                               static_cast<IdxT>(IndexGetter::grid_size));
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0 or named_usage::unspecified with "
+                "forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size);
 
-    if ( block_size == IdxT(0) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (block_size == IdxT(0)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
@@ -133,21 +153,30 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim,
+          int BLOCK_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0 or named_usage::unspecified with "
+                "forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
     const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size);
@@ -157,19 +186,24 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim, typename Concretizer, typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker> {
+  using IndexGetter = ::RAJA::hip::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_len();
 
@@ -178,20 +212,30 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0 or named_usage::unspecified with "
+                "forall");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0 or named_usage::unspecified with "
+                "forall");
 
   using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT RAJA_UNUSED_ARG(len),
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT RAJA_UNUSED_ARG(len),
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
     const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
@@ -201,21 +245,30 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0 or named_usage::unspecified with "
+                "forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size);
@@ -225,21 +278,30 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim,
+          int BLOCK_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0 or named_usage::unspecified with "
+                "forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
     const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size);
@@ -249,19 +311,24 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
-{
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+template <named_dim dim, typename Concretizer, typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker> {
+  using IndexGetter = ::RAJA::hip::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_device();
 
@@ -291,14 +358,14 @@ template <typename EXEC_POL,
           typename IndexType,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
           typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size > 0),
+                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_hip_kernel(LOOP_BODY loop_body,
+                           const Iterator idx,
+                           IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -315,14 +382,13 @@ template <typename EXEC_POL,
           typename IndexType,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
           typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size <= 0),
+                           size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forall_hip_kernel(LOOP_BODY loop_body,
+                                  const Iterator idx,
+                                  IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -340,22 +406,22 @@ template <typename EXEC_POL,
           typename ForallParam,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
           typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size > 0),
+                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_hip_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length,
+                            ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
   auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  if (ii < length) {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -367,133 +433,134 @@ template <typename EXEC_POL,
           typename ForallParam,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
           typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size <= 0),
+                           size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_hip_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length,
+                                   ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
   auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  if (ii < length) {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template <
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_hip_kernel(LOOP_BODY loop_body,
+                           const Iterator idx,
+                           IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
        ii += IterationGetter::template size<IndexType>()) {
     body(idx[ii]);
   }
 }
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+template <
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forall_hip_kernel(LOOP_BODY loop_body,
+                                  const Iterator idx,
+                                  IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
        ii += IterationGetter::template size<IndexType>()) {
     body(idx[ii]);
   }
 }
 
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template <
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_hip_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length,
+                            ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
        ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+template <
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_hip_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length,
+                                   ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
        ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -508,27 +575,37 @@ void forallp_hip_kernel(LOOP_BODY loop_body,
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename Iterable,
+          typename LoopBody,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Hip>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Hip hip_res,
-            ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>const&,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam)
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+forall_impl(
+    resources::Hip hip_res,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> const&,
+    Iterable&& iter,
+    LoopBody&& loop_body,
+    ForallParam)
 {
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
+  using Iterator = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL = ::RAJA::policy::hip::
+      hip_exec<IterationMapping, IterationGetter, Concretizer, Async>;
+  using UniqueMarker = ::camp::
+      list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping,
+                                                              IterationGetter,
+                                                              Concretizer,
+                                                              UniqueMarker>;
 
   //
   // Compute the requested iteration space size
@@ -560,14 +637,20 @@ forall_impl(resources::Hip hip_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::hip::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, hip_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body =
+          RAJA::hip::make_launch_body(func,
+                                      dims.blocks,
+                                      dims.threads,
+                                      shmem,
+                                      hip_res,
+                                      std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len};
-      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len};
+      RAJA::hip::launch(
+          func, dims.blocks, dims.threads, args, shmem, hip_res, Async);
     }
 
     RAJA_FT_END;
@@ -577,27 +660,38 @@ forall_impl(resources::Hip hip_res,
 }
 
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename Iterable,
+          typename LoopBody,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Hip>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Hip hip_res,
-            ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> const&,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam f_params)
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+forall_impl(
+    resources::Hip hip_res,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> const&,
+    Iterable&& iter,
+    LoopBody&& loop_body,
+    ForallParam f_params)
 {
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
+  using Iterator = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL = ::RAJA::policy::hip::
+      hip_exec<IterationMapping, IterationGetter, Concretizer, Async>;
+  using UniqueMarker = ::camp::
+      list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping,
+                                                              IterationGetter,
+                                                              Concretizer,
+                                                              UniqueMarker>;
 
   //
   // Compute the requested iteration space size
@@ -610,7 +704,10 @@ forall_impl(resources::Hip hip_res,
   if (len > 0) {
 
     auto func = reinterpret_cast<const void*>(
-        &impl::forallp_hip_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType,
+        &impl::forallp_hip_kernel<EXEC_POL,
+                                  Iterator,
+                                  LOOP_BODY,
+                                  IndexType,
                                   camp::decay<ForallParam>>);
 
     //
@@ -637,14 +734,23 @@ forall_impl(resources::Hip hip_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::hip::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, hip_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body =
+          RAJA::hip::make_launch_body(func,
+                                      dims.blocks,
+                                      dims.threads,
+                                      shmem,
+                                      hip_res,
+                                      std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params};
-      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res, Async);
+      void* args[] = {(void*)&body,
+                      (void*)&begin,
+                      (void*)&len,
+                      (void*)&f_params};
+      RAJA::hip::launch(
+          func, dims.blocks, dims.threads, args, shmem, hip_res, Async);
 
       RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params, launch_info);
     }
@@ -675,22 +781,29 @@ forall_impl(resources::Hip hip_res,
  ******************************************************************************
  */
 template <typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Hip>
-forall_impl(resources::Hip r,
-            ExecPolicy<seq_segit, ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>,
-            const TypedIndexSet<SegmentTypes...>& iset,
-            LoopBody&& loop_body)
+RAJA_INLINE resources::EventProxy<resources::Hip> forall_impl(
+    resources::Hip r,
+    ExecPolicy<
+        seq_segit,
+        ::RAJA::policy::hip::
+            hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
   for (int isi = 0; isi < num_seg; ++isi) {
-    iset.segmentCall(r,
-                     isi,
-                     detail::CallForall(),
-                     ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, true>(),
-                     loop_body);
+    iset.segmentCall(
+        r,
+        isi,
+        detail::CallForall(),
+        ::RAJA::policy::hip::
+            hip_exec<IterationMapping, IterationGetter, Concretizer, true>(),
+        loop_body);
   }  // iterate over segments of index set
 
   if (!Async) RAJA::hip::synchronize(r);
diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp
index c72a0b5c4f..5340e1d839 100644
--- a/include/RAJA/policy/hip/intrinsics.hpp
+++ b/include/RAJA/policy/hip/intrinsics.hpp
@@ -25,15 +25,14 @@
 
 #if defined(RAJA_ENABLE_HIP)
 
-#include <type_traits>
-
 #include <hip/hip_runtime.h>
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/SoAArray.hpp"
-#include "RAJA/util/types.hpp"
+#include <type_traits>
 
 #include "RAJA/policy/hip/policy.hpp"
+#include "RAJA/util/SoAArray.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 
 
 namespace RAJA
@@ -57,17 +56,10 @@ namespace impl
  *       so device scope fences are required to make memory accesses visible
  *       to the whole device.
  */
-struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
-{
-  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
-  {
-    __threadfence();
-  }
+struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor {
+  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_release()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
 };
 
 /*!
@@ -90,24 +82,28 @@ struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
  *
  ******************************************************************************
  */
-struct AccessorDeviceScopeUseBlockFence
-{
+struct AccessorDeviceScopeUseBlockFence {
   // hip has 32 and 64 bit atomics
   static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int);
   static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long);
 
-  template < typename T >
+  template <typename T>
   static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::
+        AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
-    auto ptr = const_cast<integer_type*>(reinterpret_cast<const integer_type*>(in_ptr + idx));
+    auto ptr = const_cast<integer_type*>(
+        reinterpret_cast<const integer_type*>(in_ptr + idx));
 
     for (size_t i = 0; i < u.array_size(); ++i) {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load)
-      u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#if defined(RAJA_USE_HIP_INTRINSICS) && \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load)
+      u.array[i] = __hip_atomic_load(&ptr[i],
+                                     __ATOMIC_RELAXED,
+                                     __HIP_MEMORY_SCOPE_AGENT);
 #else
       u.array[i] = atomicAdd(&ptr[i], integer_type(0));
 #endif
@@ -116,10 +112,11 @@ struct AccessorDeviceScopeUseBlockFence
     return u.get_value();
   }
 
-  template < typename T >
+  template <typename T>
   static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::
+        AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
@@ -127,8 +124,12 @@ struct AccessorDeviceScopeUseBlockFence
     auto ptr = reinterpret_cast<integer_type*>(in_ptr + idx);
 
     for (size_t i = 0; i < u.array_size(); ++i) {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store)
-      __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#if defined(RAJA_USE_HIP_INTRINSICS) && \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store)
+      __hip_atomic_store(&ptr[i],
+                         u.array[i],
+                         __ATOMIC_RELAXED,
+                         __HIP_MEMORY_SCOPE_AGENT);
 #else
       atomicExch(&ptr[i], u.array[i]);
 #endif
@@ -137,7 +138,8 @@ struct AccessorDeviceScopeUseBlockFence
 
   static RAJA_DEVICE RAJA_INLINE void fence_acquire()
   {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence)
+#if defined(RAJA_USE_HIP_INTRINSICS) && \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence)
     __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 #else
     __threadfence();
@@ -146,11 +148,13 @@ struct AccessorDeviceScopeUseBlockFence
 
   static RAJA_DEVICE RAJA_INLINE void fence_release()
   {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \
-                                        RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt)
+#if defined(RAJA_USE_HIP_INTRINSICS) &&                        \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt)
     __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
     // Wait until all vmem operations complete (s_waitcnt vmcnt(0))
-    __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) | (/*lgkmcnt*/ 0xf << 8));
+    __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) |
+                               (/*lgkmcnt*/ 0xf << 8));
 #else
     __threadfence();
 #endif
@@ -175,7 +179,9 @@ constexpr size_t max_shfl_int_type_size = sizeof(unsigned int);
 template <typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::
+      AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size>
+          u;
   u.set_value(var);
 
   for (size_t i = 0; i < u.array_size(); ++i) {
@@ -187,7 +193,9 @@ RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 template <typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::
+      AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size>
+          u;
   u.set_value(var);
 
   for (size_t i = 0; i < u.array_size(); ++i) {
@@ -316,12 +324,18 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   // reduce per warp values
   if (numThreads > policy::hip::device_constants.WARP_SIZE) {
 
-    static_assert(policy::hip::device_constants.MAX_WARPS <= policy::hip::device_constants.WARP_SIZE,
-        "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values");
+    static_assert(policy::hip::device_constants.MAX_WARPS <=
+                      policy::hip::device_constants.WARP_SIZE,
+                  "This algorithms assumes a warp of WARP_SIZE threads can "
+                  "reduce MAX_WARPS values");
 
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>)];
     RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>* sd =
-      reinterpret_cast<RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS> *>(tmpsd);
+        reinterpret_cast<
+            RAJA::detail::SoAArray<T,
+                                   policy::hip::device_constants.MAX_WARPS>*>(
+            tmpsd);
 
     // write per warp values to shared memory
     if (warpId == 0) {
diff --git a/include/RAJA/policy/hip/kernel/Conditional.hpp b/include/RAJA/policy/hip/kernel/Conditional.hpp
index 3204845544..292439676f 100644
--- a/include/RAJA/policy/hip/kernel/Conditional.hpp
+++ b/include/RAJA/policy/hip/kernel/Conditional.hpp
@@ -18,17 +18,14 @@
 #ifndef RAJA_policy_hip_kernel_Conditional_HPP
 #define RAJA_policy_hip_kernel_Conditional_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/Conditional.hpp"
-
 #include "RAJA/policy/hip/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -41,17 +38,14 @@ template <typename Data,
           typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<Data,
-                             statement::If<Conditional, EnclosedStmts...>,
-                             Types> {
+                            statement::If<Conditional, EnclosedStmts...>,
+                            Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     if (Conditional::eval(data)) {
 
@@ -61,10 +55,7 @@ struct HipStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp
index addb556b88..0d08b4642f 100644
--- a/include/RAJA/policy/hip/kernel/For.hpp
+++ b/include/RAJA/policy/hip/kernel/For.hpp
@@ -20,7 +20,6 @@
 #define RAJA_policy_hip_kernel_For_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/hip/kernel/internal.hpp"
 
 
@@ -44,9 +43,11 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                   EnclosedStmts...>,
+    statement::For<
+        ArgumentId,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
@@ -60,10 +61,10 @@ struct HipStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
+      RAJA::policy::hip::
+          hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t i = IndexMapper::template index<diff_t>();
 
@@ -74,8 +75,7 @@ struct HipStatementExecutor<
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -104,7 +104,9 @@ template <typename Data,
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                   RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                  sync,
+                                                  IndexMapper>,
                    EnclosedStmts...>,
     Types> {
 
@@ -119,10 +121,10 @@ struct HipStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+      RAJA::policy::hip::
+          hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i = IndexMapper::template index<diff_t>();
@@ -138,8 +140,7 @@ struct HipStatementExecutor<
     enclosed_stmts_t::exec(data, thread_active && have_work);
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -167,7 +168,10 @@ template <typename Data,
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
+                   RAJA::policy::hip::hip_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::sync,
+                       IndexMapper>,
                    EnclosedStmts...>,
     Types> {
 
@@ -181,12 +185,14 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+  using DimensionCalculator =
+      RAJA::internal::KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -210,8 +216,7 @@ struct HipStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     diff_t len = segment_length<ArgumentId>(data);
 
@@ -239,7 +244,10 @@ template <typename Data,
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
+                   RAJA::policy::hip::hip_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::none,
+                       IndexMapper>,
                    EnclosedStmts...>,
     Types> {
 
@@ -253,12 +261,14 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+  using DimensionCalculator =
+      RAJA::internal::KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -277,8 +287,7 @@ struct HipStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -304,13 +313,18 @@ struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
     Types>
-: HipStatementExecutor<Data, statement::For<ArgumentId,
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
+    : HipStatementExecutor<
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types> {
 };
 
 
@@ -322,33 +336,31 @@ struct HipStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_warp_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_warp_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -358,13 +370,11 @@ struct HipStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -379,7 +389,7 @@ struct HipStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -391,38 +401,36 @@ struct HipStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_warp_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_warp_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // masked size strided loop
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
     for (diff_t ii = 0; ii < len; ii += i_stride) {
@@ -441,9 +449,7 @@ struct HipStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -458,7 +464,7 @@ struct HipStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -470,30 +476,27 @@ struct HipStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_thread_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_thread_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -503,13 +506,11 @@ struct HipStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -525,7 +526,7 @@ struct HipStatementExecutor<
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
@@ -537,36 +538,33 @@ struct HipStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_thread_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_thread_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // masked size strided loop
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
     for (diff_t ii = 0; ii < len; ii += i_stride) {
@@ -585,9 +583,7 @@ struct HipStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -603,7 +599,7 @@ struct HipStatementExecutor<
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp
index 3342f994e0..44b1bc5437 100644
--- a/include/RAJA/policy/hip/kernel/ForICount.hpp
+++ b/include/RAJA/policy/hip/kernel/ForICount.hpp
@@ -20,7 +20,6 @@
 #define RAJA_policy_hip_kernel_ForICount_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/hip/kernel/internal.hpp"
 
 
@@ -46,29 +45,37 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
+                                             sync,
+                                             IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = HipStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
+                                         sync,
+                                         IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t i = IndexMapper::template index<diff_t>();
@@ -98,29 +105,35 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::
+                  hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = HipStatementExecutor<
       Data,
       statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                     RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                    sync,
+                                                    IndexMapper>,
                      EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -154,29 +167,41 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = HipStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -217,29 +242,41 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = HipStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
@@ -277,13 +314,18 @@ struct HipStatementExecutor<
     Data,
     statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
     Types>
-: HipStatementExecutor<Data, statement::ForICount<ArgumentId,
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
+    : HipStatementExecutor<
+          Data,
+          statement::ForICount<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types> {
 };
 
 
@@ -296,40 +338,46 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_warp_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_warp_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_warp_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_warp_masked_direct<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -340,9 +388,8 @@ struct HipStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
 
@@ -355,45 +402,51 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_warp_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_warp_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_warp_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_warp_masked_loop<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // masked size strided loop
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
     for (diff_t ii = 0; ii < len; ii += i_stride) {
@@ -411,7 +464,6 @@ struct HipStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 
@@ -424,37 +476,42 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_thread_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_thread_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_thread_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_thread_masked_direct<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -465,9 +522,8 @@ struct HipStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
 
@@ -480,42 +536,47 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_thread_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_thread_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_thread_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_thread_masked_loop<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // masked size strided loop
     const diff_t len = segment_length<ArgumentId>(data);
     const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
     for (diff_t ii = 0; ii < len; ii += i_stride) {
@@ -533,7 +594,6 @@ struct HipStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 }  // namespace internal
diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp
index 1ed7740008..2be048584a 100644
--- a/include/RAJA/policy/hip/kernel/HipKernel.hpp
+++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp
@@ -26,19 +26,15 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/For.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
-
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
-#include "RAJA/policy/hip/policy.hpp"
-
 #include "RAJA/policy/hip/kernel/internal.hpp"
+#include "RAJA/policy/hip/policy.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -51,7 +47,8 @@ namespace RAJA
  *
  */
 template <bool async0, int num_blocks, int num_threads>
-struct hip_explicit_launch {};
+struct hip_explicit_launch {
+};
 
 /*!
  * HIP kernel launch policy where the user specifies the number of physical
@@ -87,7 +84,9 @@ namespace statement
  */
 template <typename LaunchConfig, typename... EnclosedStmts>
 struct HipKernelExt
-    : public internal::Statement<::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>, EnclosedStmts...> {
+    : public internal::Statement<
+          ::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>,
+          EnclosedStmts...> {
 };
 
 
@@ -99,7 +98,8 @@ struct HipKernelExt
  */
 template <int num_blocks, int num_threads, typename... EnclosedStmts>
 using HipKernelExp =
-    HipKernelExt<hip_explicit_launch<false, num_blocks, num_threads>, EnclosedStmts...>;
+    HipKernelExt<hip_explicit_launch<false, num_blocks, num_threads>,
+                 EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with the flexibility
@@ -109,7 +109,8 @@ using HipKernelExp =
  */
 template <int num_blocks, int num_threads, typename... EnclosedStmts>
 using HipKernelExpAsync =
-    HipKernelExt<hip_explicit_launch<true, num_blocks, num_threads>, EnclosedStmts...>;
+    HipKernelExt<hip_explicit_launch<true, num_blocks, num_threads>,
+                 EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel using the
@@ -135,9 +136,9 @@ using HipKernelOccAsync =
  * The kernel launch is synchronous.
  */
 template <int num_threads, typename... EnclosedStmts>
-using HipKernelFixed =
-    HipKernelExt<hip_explicit_launch<false, operators::limits<int>::max(), num_threads>,
-                  EnclosedStmts...>;
+using HipKernelFixed = HipKernelExt<
+    hip_explicit_launch<false, operators::limits<int>::max(), num_threads>,
+    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with a fixed
@@ -145,8 +146,9 @@ using HipKernelFixed =
  * The kernel launch is asynchronous.
  */
 template <int num_threads, typename... EnclosedStmts>
-using HipKernelFixedAsync =
-    HipKernelExt<hip_explicit_launch<true, operators::limits<int>::max(), num_threads>, EnclosedStmts...>;
+using HipKernelFixedAsync = HipKernelExt<
+    hip_explicit_launch<true, operators::limits<int>::max(), num_threads>,
+    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with 1024 threads
@@ -210,10 +212,10 @@ __launch_bounds__(BlockSize, 1) __global__
  * The default case handles BlockSize != 0 and gets the fixed max block size
  * version of the kernel.
  */
-template<int BlockSize, typename Data, typename executor_t>
-struct HipKernelLauncherGetter
-{
-  using type = camp::decay<decltype(&internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>)>;
+template <int BlockSize, typename Data, typename executor_t>
+struct HipKernelLauncherGetter {
+  using type = camp::decay<
+      decltype(&internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>)>;
   static constexpr type get() noexcept
   {
     return &internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>;
@@ -224,10 +226,10 @@ struct HipKernelLauncherGetter
  * Helper class specialization for BlockSize == 0 and gets the unfixed max
  * block size version of the kernel.
  */
-template<typename Data, typename executor_t>
-struct HipKernelLauncherGetter<0, Data, executor_t>
-{
-  using type = camp::decay<decltype(&internal::HipKernelLauncher<Data, executor_t>)>;
+template <typename Data, typename executor_t>
+struct HipKernelLauncherGetter<0, Data, executor_t> {
+  using type =
+      camp::decay<decltype(&internal::HipKernelLauncher<Data, executor_t>)>;
   static constexpr type get() noexcept
   {
     return &internal::HipKernelLauncher<Data, executor_t>;
@@ -235,12 +237,14 @@ struct HipKernelLauncherGetter<0, Data, executor_t>
 };
 
 
-
 /*!
  * Helper class that handles HIP kernel launching, and computing
  * maximum number of threads/blocks
  */
-template<typename LaunchPolicy, typename StmtList, typename Data, typename Types>
+template <typename LaunchPolicy,
+          typename StmtList,
+          typename Data,
+          typename Types>
 struct HipLaunchHelper;
 
 
@@ -249,24 +253,36 @@ struct HipLaunchHelper;
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the HIP occupancy calculator.
  */
-template<bool async0, int num_blocks, int num_threads, typename StmtList, typename Data, typename Types>
-struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,StmtList,Data,Types>
-{
+template <bool async0,
+          int num_blocks,
+          int num_threads,
+          typename StmtList,
+          typename Data,
+          typename Types>
+struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,
+                       StmtList,
+                       Data,
+                       Types> {
   using Self = HipLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::hip_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::hip_statement_list_executor_t<StmtList, Data, Types>;
 
-  using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, Data, executor_t>;
+  using kernelGetter_t =
+      HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
+                              Data,
+                              executor_t>;
 
-  inline static const void* get_func()
+  inline static const void *get_func()
   {
-    return reinterpret_cast<const void*>(kernelGetter_t::get());
+    return reinterpret_cast<const void *>(kernelGetter_t::get());
   }
 
   inline static void recommended_blocks_threads(size_t shmem_size,
-      int &recommended_blocks, int &recommended_threads)
+                                                int &recommended_blocks,
+                                                int &recommended_threads)
   {
     auto func = Self::get_func();
 
@@ -278,8 +294,9 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
         // determine blocks at runtime
         // determine threads at runtime
         //
-        auto data = ::RAJA::hip::hip_occupancy_max_blocks_threads<Self>(
-            func, shmem_size);
+        auto data =
+            ::RAJA::hip::hip_occupancy_max_blocks_threads<Self>(func,
+                                                                shmem_size);
         recommended_blocks = data.func_max_blocks_per_device;
         recommended_threads = data.func_max_threads_per_block;
 
@@ -293,8 +310,8 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
 
         auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
+        recommended_blocks =
+            data.func_max_blocks_per_sm * data.device_sm_per_device;
       }
 
     } else {
@@ -313,18 +330,17 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
         // threads determined at compile-time
         //
         recommended_threads = num_threads;
-
       }
 
       //
       // blocks determined at compile-time
       //
       recommended_blocks = num_blocks;
-
     }
   }
 
-  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size), int &max_threads)
+  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size),
+                                 int &max_threads)
   {
     if (num_threads <= 0) {
 
@@ -340,12 +356,12 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
       // threads determined at compile-time
       //
       max_threads = num_threads;
-
     }
   }
 
   inline static void max_blocks(size_t shmem_size,
-      int &max_blocks, int actual_threads)
+                                int &max_blocks,
+                                int actual_threads)
   {
     auto func = Self::get_func();
 
@@ -354,14 +370,14 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
       //
       // determine blocks at runtime
       //
-      if (num_threads <= 0 ||
-          num_threads != actual_threads) {
+      if (num_threads <= 0 || num_threads != actual_threads) {
 
         //
         // determine blocks when actual_threads != num_threads
         //
-        auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self>(
-            func, shmem_size, actual_threads);
+        auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self>(func,
+                                                                shmem_size,
+                                                                actual_threads);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
 
       } else {
@@ -372,7 +388,6 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
         auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
       }
 
     } else {
@@ -381,7 +396,6 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
       // blocks determined at compile-time
       //
       max_blocks = num_blocks;
-
     }
   }
 };
@@ -395,8 +409,10 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
  * The algorithm is greedy (and probably could be improved), and favors
  * maximizing the number of threads (or blocks) in x, y, then z.
  */
-inline
-hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum = hip_dim_t()){
+inline hip_dim_t fitHipDims(hip_dim_member_t limit,
+                            hip_dim_t result,
+                            hip_dim_t minimum = hip_dim_t())
+{
 
 
   // clamp things to at least 1
@@ -409,12 +425,12 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
   minimum.z = minimum.z ? minimum.z : 1;
 
   // if we are under the limit, we're done
-  if(result.x * result.y * result.z <= limit) return result;
+  if (result.x * result.y * result.z <= limit) return result;
 
   // Can we reduce z to fit?
-  if(result.x * result.y * minimum.z < limit){
+  if (result.x * result.y * minimum.z < limit) {
     // compute a new z
-    result.z = limit / (result.x*result.y);
+    result.z = limit / (result.x * result.y);
     return result;
   }
   // we don't fit, so reduce z to it's minimum and continue on to y
@@ -422,9 +438,9 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
 
 
   // Can we reduce y to fit?
-  if(result.x * minimum.y * result.z < limit){
+  if (result.x * minimum.y * result.z < limit) {
     // compute a new y
-    result.y = limit / (result.x*result.z);
+    result.y = limit / (result.x * result.z);
     return result;
   }
   // we don't fit, so reduce y to it's minimum and continue on to x
@@ -432,9 +448,9 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
 
 
   // Can we reduce y to fit?
-  if(minimum.x * result.y * result.z < limit){
+  if (minimum.x * result.y * result.z < limit) {
     // compute a new x
-    result.x = limit / (result.y*result.z);
+    result.x = limit / (result.y * result.z);
     return result;
   }
   // we don't fit, so we'll return the smallest possible thing
@@ -449,18 +465,19 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
  */
 template <typename LaunchConfig, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::HipKernelExt<LaunchConfig, EnclosedStmts...>, Types> {
+    statement::HipKernelExt<LaunchConfig, EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
-  using StatementType =
-      statement::HipKernelExt<LaunchConfig, EnclosedStmts...>;
+  using StatementType = statement::HipKernelExt<LaunchConfig, EnclosedStmts...>;
 
   template <typename Data>
   static inline void exec(Data &&data)
   {
 
     using data_t = camp::decay<Data>;
-    using executor_t = hip_statement_list_executor_t<stmt_list_t, data_t, Types>;
+    using executor_t =
+        hip_statement_list_executor_t<stmt_list_t, data_t, Types>;
     using launch_t = HipLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
 
 
@@ -489,8 +506,9 @@ struct StatementExecutor<
       //
       int recommended_blocks;
       int recommended_threads;
-      launch_t::recommended_blocks_threads(
-          shmem, recommended_blocks, recommended_threads);
+      launch_t::recommended_blocks_threads(shmem,
+                                           recommended_blocks,
+                                           recommended_threads);
 
 
       //
@@ -503,24 +521,24 @@ struct StatementExecutor<
       //
       // Fit the requested threads
       //
-      hip_dim_t fit_threads{0,0,0};
+      hip_dim_t fit_threads{0, 0, 0};
 
-      if ( recommended_threads >= get_size(launch_dims.min_dims.threads) ) {
-
-        fit_threads = fitHipDims(
-            recommended_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads >= get_size(launch_dims.min_dims.threads)) {
 
+        fit_threads = fitHipDims(recommended_threads,
+                                 launch_dims.dims.threads,
+                                 launch_dims.min_dims.threads);
       }
 
       //
       // Redo fit with max threads
       //
-      if ( recommended_threads < max_threads &&
-           get_size(fit_threads) != recommended_threads ) {
-
-        fit_threads = fitHipDims(
-            max_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads < max_threads &&
+          get_size(fit_threads) != recommended_threads) {
 
+        fit_threads = fitHipDims(max_threads,
+                                 launch_dims.dims.threads,
+                                 launch_dims.min_dims.threads);
       }
 
       launch_dims.dims.threads = fit_threads;
@@ -534,7 +552,7 @@ struct StatementExecutor<
 
       int use_blocks;
 
-      if ( launch_dims.num_threads() == recommended_threads ) {
+      if (launch_dims.num_threads() == recommended_threads) {
 
         //
         // Fit the requested blocks
@@ -547,11 +565,11 @@ struct StatementExecutor<
         // Fit the max blocks
         //
         use_blocks = max_blocks;
-
       }
 
-      launch_dims.dims.blocks = fitHipDims(
-          use_blocks, launch_dims.dims.blocks, launch_dims.min_dims.blocks);
+      launch_dims.dims.blocks = fitHipDims(use_blocks,
+                                           launch_dims.dims.blocks,
+                                           launch_dims.min_dims.blocks);
 
       //
       // make sure that we fit
@@ -560,7 +578,7 @@ struct StatementExecutor<
       if(launch_dims.num_blocks() > max_blocks){
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num blocks");
       }*/
-      if(launch_dims.num_threads() > max_threads){
+      if (launch_dims.num_threads() > max_threads) {
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num threads");
       }
 
@@ -575,13 +593,23 @@ struct StatementExecutor<
         // currently an unresolved issue.
         //
         auto hip_data = RAJA::hip::make_launch_body(func,
-            launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res, data);
+                                                    launch_dims.dims.blocks,
+                                                    launch_dims.dims.threads,
+                                                    shmem,
+                                                    res,
+                                                    data);
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&hip_data};
-        RAJA::hip::launch(func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, launch_t::async);
+        void *args[] = {(void *)&hip_data};
+        RAJA::hip::launch(func,
+                          launch_dims.dims.blocks,
+                          launch_dims.dims.threads,
+                          args,
+                          shmem,
+                          res,
+                          launch_t::async);
       }
     }
   }
diff --git a/include/RAJA/policy/hip/kernel/Hyperplane.hpp b/include/RAJA/policy/hip/kernel/Hyperplane.hpp
index 5c428f03ab..e9280b1cbb 100644
--- a/include/RAJA/policy/hip/kernel/Hyperplane.hpp
+++ b/include/RAJA/policy/hip/kernel/Hyperplane.hpp
@@ -18,17 +18,14 @@
 #ifndef RAJA_policy_hip_kernel_Hyperplane_HPP
 #define RAJA_policy_hip_kernel_Hyperplane_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "camp/camp.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/Hyperplane.hpp"
-
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -41,33 +38,30 @@ template <typename Data,
           camp::idx_t... Args,
           typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<Data,
-                             statement::Hyperplane<HpArgumentId,
-                                                   seq_exec,
-                                                   ArgList<Args...>,
-                                                   EnclosedStmts...>,
-                            Types> {
+struct HipStatementExecutor<
+    Data,
+    statement::
+        Hyperplane<HpArgumentId, seq_exec, ArgList<Args...>, EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, HpArgumentId, Data>;
 
-  using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+  using enclosed_stmts_t =
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // compute Manhattan distance of iteration space to determine
     // as:  hp_len = l0 + l1 + l2 + ...
-    int hp_len = segment_length<HpArgumentId>(data) +
-                 foldl(RAJA::operators::plus<int>(),
-                               segment_length<Args>(data)...);
+    int hp_len =
+        segment_length<HpArgumentId>(data) +
+        foldl(RAJA::operators::plus<int>(), segment_length<Args>(data)...);
 
     int h_args = foldl(RAJA::operators::plus<idx_t>(),
-        camp::get<Args>(data.offset_tuple)...);
+                       camp::get<Args>(data.offset_tuple)...);
 
     // get length of i dimension
     auto i_len = segment_length<HpArgumentId>(data);
@@ -93,18 +87,13 @@ struct HipStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
 };
 
 
-
-
 }  // end namespace internal
 
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/hip/kernel/InitLocalMem.hpp b/include/RAJA/policy/hip/kernel/InitLocalMem.hpp
index bbb8d6081b..ab65cf5da2 100644
--- a/include/RAJA/policy/hip/kernel/InitLocalMem.hpp
+++ b/include/RAJA/policy/hip/kernel/InitLocalMem.hpp
@@ -19,16 +19,14 @@
 #ifndef RAJA_policy_hip_kernel_InitLocalMem_HPP
 #define RAJA_policy_hip_kernel_InitLocalMem_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/InitLocalMem.hpp"
 #include "RAJA/policy/hip/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -39,27 +37,31 @@ struct hip_shared_mem;
 namespace internal
 {
 
-//Intialize thread shared array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
+// Initialize thread shared array
+template <typename Data,
+          camp::idx_t... Indices,
+          typename... EnclosedStmts,
+          typename Types>
 struct HipStatementExecutor<Data,
-                            statement::InitLocalMem<RAJA::hip_shared_mem, camp::idx_seq<Indices...>,
-                            EnclosedStmts...>,
-                            Types>
-{
+                            statement::InitLocalMem<RAJA::hip_shared_mem,
+                                                    camp::idx_seq<Indices...>,
+                                                    EnclosedStmts...>,
+                            Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  //Launch loops
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Launch loops
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void initMem(Data &data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos,
+        typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem =
+        camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::
+            layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -67,40 +69,35 @@ struct HipStatementExecutor<Data,
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Initialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void initMem(Data &data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos,
+        typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem =
+        camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::
+            layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null base case
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void setPtrToNull(Data &data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
 
-  //Set pointer to null recursive case
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null recursive case
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void setPtrToNull(Data &data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
@@ -108,47 +105,48 @@ struct HipStatementExecutor<Data,
   }
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
 
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
-//Intialize thread private array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
-struct HipStatementExecutor<Data, statement::InitLocalMem<RAJA::hip_thread_mem, camp::idx_seq<Indices...>, EnclosedStmts...>, Types>
-{
+// Initialize thread private array
+template <typename Data,
+          camp::idx_t... Indices,
+          typename... EnclosedStmts,
+          typename Types>
+struct HipStatementExecutor<Data,
+                            statement::InitLocalMem<RAJA::hip_thread_mem,
+                                                    camp::idx_seq<Indices...>,
+                                                    EnclosedStmts...>,
+                            Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  //Launch loops
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Launch loops
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void initMem(Data &data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos,
+        typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem =
+        camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::
+            layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -156,40 +154,35 @@ struct HipStatementExecutor<Data, statement::InitLocalMem<RAJA::hip_thread_mem,
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Initialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void initMem(Data &data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos,
+        typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem =
+        camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::
+            layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null base case
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void setPtrToNull(Data &data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
 
-  //Set pointer to null recursive case
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null recursive case
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void setPtrToNull(Data &data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
@@ -197,31 +190,24 @@ struct HipStatementExecutor<Data, statement::InitLocalMem<RAJA::hip_thread_mem,
   }
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
 
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/hip/kernel/Lambda.hpp b/include/RAJA/policy/hip/kernel/Lambda.hpp
index d04fb11bf6..eb5cdb4dae 100644
--- a/include/RAJA/policy/hip/kernel/Lambda.hpp
+++ b/include/RAJA/policy/hip/kernel/Lambda.hpp
@@ -26,13 +26,11 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -40,30 +38,32 @@ namespace RAJA
 namespace internal
 {
 
-template <typename Data, camp::idx_t LambdaIndex, typename... Args, typename Types>
-struct HipStatementExecutor<Data, statement::Lambda<LambdaIndex, Args...>, Types> {
+template <typename Data,
+          camp::idx_t LambdaIndex,
+          typename... Args,
+          typename Types>
+struct HipStatementExecutor<Data,
+                            statement::Lambda<LambdaIndex, Args...>,
+                            Types> {
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Only execute the lambda if it hasn't been masked off
-    if(thread_active){
-      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(data);
+    if (thread_active) {
+      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(
+          data);
     }
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const &RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
 
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/hip/kernel/Reduce.hpp b/include/RAJA/policy/hip/kernel/Reduce.hpp
index a518073e7c..e227d9a760 100644
--- a/include/RAJA/policy/hip/kernel/Reduce.hpp
+++ b/include/RAJA/policy/hip/kernel/Reduce.hpp
@@ -20,7 +20,6 @@
 #define RAJA_policy_hip_kernel_Reduce_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/hip/kernel/internal.hpp"
 
 
@@ -35,16 +34,17 @@ namespace internal
 // Executor that handles reductions across a single HIP thread block
 //
 template <typename Data,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<Data,
-                             statement::Reduce<RAJA::hip_block_reduce,
-                                               ReduceOperator,
-                                               ParamId,
-                                               EnclosedStmts...>,
-                           Types> {
+                            statement::Reduce<RAJA::hip_block_reduce,
+                                              ReduceOperator,
+                                              ParamId,
+                                              EnclosedStmts...>,
+                            Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -67,13 +67,12 @@ struct HipStatementExecutor<Data,
     // reduction objects
     using combiner_t =
         RAJA::reduce::detail::op_adapter<value_t, ReduceOperator>;
-    value_t new_value =
-        RAJA::hip::impl::block_reduce<combiner_t>(value, ident);
+    value_t new_value = RAJA::hip::impl::block_reduce<combiner_t>(value, ident);
 
 
     // execute enclosed statements, and mask off everyone but thread 0
     thread_active = threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
-    if(thread_active){
+    if (thread_active) {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
@@ -94,15 +93,16 @@ struct HipStatementExecutor<Data,
 // Executor that handles reductions across a single HIP thread warp
 //
 template <typename Data,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<Data,
-                             statement::Reduce<RAJA::hip_warp_reduce,
-                                               ReduceOperator,
-                                               ParamId,
-                                               EnclosedStmts...>,
+                            statement::Reduce<RAJA::hip_warp_reduce,
+                                              ReduceOperator,
+                                              ParamId,
+                                              EnclosedStmts...>,
                             Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
@@ -125,13 +125,12 @@ struct HipStatementExecutor<Data,
     // Call warp reduction routine
     using combiner_t =
         RAJA::reduce::detail::op_adapter<value_t, ReduceOperator>;
-    value_t new_value =
-        RAJA::hip::impl::warp_reduce<combiner_t>(value, ident);
+    value_t new_value = RAJA::hip::impl::warp_reduce<combiner_t>(value, ident);
     data.template assign_param<ParamId>(new_value);
 
     // execute enclosed statements, and mask off everyone but lane 0
     thread_active = threadIdx.x == 0;
-    if(thread_active){
+    if (thread_active) {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
@@ -148,7 +147,6 @@ struct HipStatementExecutor<Data,
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/hip/kernel/Sync.hpp b/include/RAJA/policy/hip/kernel/Sync.hpp
index d54a5ccf83..4c705eec1e 100644
--- a/include/RAJA/policy/hip/kernel/Sync.hpp
+++ b/include/RAJA/policy/hip/kernel/Sync.hpp
@@ -27,12 +27,10 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
-
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -60,15 +58,11 @@ namespace internal
 template <typename Data, typename Types>
 struct HipStatementExecutor<Data, statement::HipSyncThreads, Types> {
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &, bool) { __syncthreads(); }
+  static inline RAJA_DEVICE void exec(Data &, bool) { __syncthreads(); }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const &RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
@@ -77,15 +71,15 @@ struct HipStatementExecutor<Data, statement::HipSyncThreads, Types> {
 template <typename Data, typename Types>
 struct HipStatementExecutor<Data, statement::HipSyncWarp, Types> {
 
-  static
-  inline
-  RAJA_DEVICE
-  //not currently supported
-  void exec(Data &, bool) {  }
+  static inline RAJA_DEVICE
+      // not currently supported
+      void
+      exec(Data &, bool)
+  {
+  }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const &RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp
index 55653ddfe5..7e97e84e69 100644
--- a/include/RAJA/policy/hip/kernel/Tile.hpp
+++ b/include/RAJA/policy/hip/kernel/Tile.hpp
@@ -27,16 +27,14 @@
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/pattern/kernel/Tile.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 #include "camp/camp.hpp"
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/pattern/kernel/Tile.hpp"
-#include "RAJA/pattern/kernel/internal.hpp"
-
 namespace RAJA
 {
 namespace internal
@@ -56,12 +54,13 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -69,10 +68,11 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
+  using DimensionCalculator = KernelDimensionCalculator<
+      RAJA::policy::hip::
+          hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -80,7 +80,8 @@ struct HipStatementExecutor<
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Keep copy of original segment, so we can restore it
     segment_t orig_segment = segment;
@@ -95,12 +96,12 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
     DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
@@ -116,9 +117,9 @@ struct HipStatementExecutor<
     // restrict to first tile
     segment = segment.slice(0, static_cast<diff_t>(chunk_size));
 
-    // NOTE: We do not detect improper uses of direct_unchecked policies under tiling.
-    // This happens when using a direct unchecked policy on a tiled range that is not
-    // evenly divisible by chunk_size.
+    // NOTE: We do not detect improper uses of direct_unchecked policies under
+    // tiling. This happens when using a direct unchecked policy on a tiled
+    // range that is not evenly divisible by chunk_size.
     LaunchDims enclosed_dims =
         enclosed_stmts_t::calculateDimensions(private_data);
 
@@ -142,10 +143,11 @@ struct HipStatementExecutor<
     Data,
     statement::Tile<ArgumentId,
                     RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                    RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                   sync,
+                                                   IndexMapper>,
                     EnclosedStmts...>,
-                    Types>
-  {
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -153,10 +155,11 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+  using DimensionCalculator = KernelDimensionCalculator<
+      RAJA::policy::hip::
+          hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -165,7 +168,8 @@ struct HipStatementExecutor<
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -184,12 +188,12 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
     DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
@@ -225,11 +229,15 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -237,10 +245,13 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -251,8 +262,10 @@ struct HipStatementExecutor<
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
@@ -274,12 +287,12 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
     DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
@@ -315,11 +328,15 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -327,10 +344,13 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -341,8 +361,10 @@ struct HipStatementExecutor<
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
@@ -359,12 +381,12 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
     DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
@@ -400,14 +422,21 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>
-: HipStatementExecutor<Data, statement::Tile<ArgumentId, TPol,
-    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
+    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : HipStatementExecutor<
+          Data,
+          statement::Tile<
+              ArgumentId,
+              TPol,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types> {
 };
 
 }  // end namespace internal
diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp
index d73c71169e..0228ad4adc 100644
--- a/include/RAJA/policy/hip/kernel/TileTCount.hpp
+++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp
@@ -27,16 +27,14 @@
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/pattern/kernel/Tile.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 #include "camp/camp.hpp"
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/pattern/kernel/Tile.hpp"
-#include "RAJA/pattern/kernel/internal.hpp"
-
 namespace RAJA
 {
 namespace internal
@@ -58,32 +56,40 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
+                                             sync,
+                                             IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = HipStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
+                                         sync,
+                                         IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -125,32 +131,38 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::
+                  hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = HipStatementExecutor<
       Data,
       statement::Tile<ArgumentId,
                       RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                      RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                     sync,
+                                                     IndexMapper>,
                       EnclosedStmts...>,
-                      Types>;
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -196,32 +208,44 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = HipStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -239,7 +263,7 @@ struct HipStatementExecutor<
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for(diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
+    for (diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -274,32 +298,44 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types> {
 
   using Base = HipStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -317,7 +353,7 @@ struct HipStatementExecutor<
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for(diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -346,14 +382,23 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>, Types>
-: HipStatementExecutor<Data, statement::TileTCount<ArgumentId, ParamId, TPol,
-    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
+    statement::
+        TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : HipStatementExecutor<
+          Data,
+          statement::TileTCount<
+              ArgumentId,
+              ParamId,
+              TPol,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types> {
 };
 
 }  // end namespace internal
diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp
index b8a2f017b6..d6abc94642 100644
--- a/include/RAJA/policy/hip/kernel/internal.hpp
+++ b/include/RAJA/policy/hip/kernel/internal.hpp
@@ -27,15 +27,12 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #include "RAJA/policy/hip/policy.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -54,19 +51,16 @@ struct LaunchDims {
   LaunchDims& operator=(LaunchDims const&) = default;
 
   RAJA_INLINE
-  LaunchDims(HipDims _dims)
-    : dims{_dims}
-    , min_dims{}
-  { }
+  LaunchDims(HipDims _dims) : dims{_dims}, min_dims{} {}
 
   RAJA_INLINE
   LaunchDims(HipDims _dims, HipDims _min_dims)
-    : dims{_dims}
-    , min_dims{_min_dims}
-  { }
+      : dims{_dims}, min_dims{_min_dims}
+  {
+  }
 
   RAJA_INLINE
-  LaunchDims max(LaunchDims const &c) const
+  LaunchDims max(LaunchDims const& c) const
   {
     LaunchDims result;
 
@@ -82,38 +76,38 @@ struct LaunchDims {
     result.dims.threads.y = std::max(c.dims.threads.y, dims.threads.y);
     result.dims.threads.z = std::max(c.dims.threads.z, dims.threads.z);
 
-    result.min_dims.threads.x = std::max(c.min_dims.threads.x, min_dims.threads.x);
-    result.min_dims.threads.y = std::max(c.min_dims.threads.y, min_dims.threads.y);
-    result.min_dims.threads.z = std::max(c.min_dims.threads.z, min_dims.threads.z);
+    result.min_dims.threads.x =
+        std::max(c.min_dims.threads.x, min_dims.threads.x);
+    result.min_dims.threads.y =
+        std::max(c.min_dims.threads.y, min_dims.threads.y);
+    result.min_dims.threads.z =
+        std::max(c.min_dims.threads.z, min_dims.threads.z);
 
     return result;
   }
 
   RAJA_INLINE
-  int num_blocks() const {
-    return dims.num_blocks();
-  }
+  int num_blocks() const { return dims.num_blocks(); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return dims.num_threads();
-  }
+  int num_threads() const { return dims.num_threads(); }
 
 
   RAJA_INLINE
-  void clamp_to_min_blocks() {
+  void clamp_to_min_blocks()
+  {
     dims.blocks.x = std::max(min_dims.blocks.x, dims.blocks.x);
     dims.blocks.y = std::max(min_dims.blocks.y, dims.blocks.y);
     dims.blocks.z = std::max(min_dims.blocks.z, dims.blocks.z);
   };
 
   RAJA_INLINE
-  void clamp_to_min_threads() {
+  void clamp_to_min_threads()
+  {
     dims.threads.x = std::max(min_dims.threads.x, dims.threads.x);
     dims.threads.y = std::max(min_dims.threads.y, dims.threads.y);
     dims.threads.z = std::max(min_dims.threads.z, dims.threads.z);
   };
-
 };
 
 
@@ -126,7 +120,7 @@ struct HipStatementListExecutorHelper {
   using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &data, bool thread_active)
+  inline static RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute stmt
     cur_stmt_t::exec(data, thread_active);
@@ -137,7 +131,7 @@ struct HipStatementListExecutorHelper {
 
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &data)
+  inline static LaunchDims calculateDimensions(Data& data)
   {
     // Compute this statements launch dimensions
     LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
@@ -154,13 +148,13 @@ template <camp::idx_t num_stmts, typename StmtList>
 struct HipStatementListExecutorHelper<num_stmts, num_stmts, StmtList> {
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &, bool)
+  inline static RAJA_DEVICE void exec(Data&, bool)
   {
     // nop terminator
   }
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &)
+  inline static LaunchDims calculateDimensions(Data&)
   {
     return LaunchDims();
   }
@@ -182,20 +176,15 @@ struct HipStatementListExecutor<Data, StatementList<Stmts...>, Types> {
 
   static constexpr size_t num_stmts = sizeof...(Stmts);
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute statements in order with helper class
-    HipStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, thread_active);
+    HipStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
+        data, thread_active);
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute this statements launch dimensions
     return HipStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::
@@ -205,79 +194,90 @@ struct HipStatementListExecutor<Data, StatementList<Stmts...>, Types> {
 
 
 template <typename StmtList, typename Data, typename Types>
-using hip_statement_list_executor_t = HipStatementListExecutor<
-    Data,
-    StmtList,
-    Types>;
+using hip_statement_list_executor_t =
+    HipStatementListExecutor<Data, StmtList, Types>;
 
 
 // specialization for direct sequential policies
-template<typename kernel_indexer>
+template <typename kernel_indexer>
 struct KernelDimensionCalculator;
 
 // specialization for direct unchecked sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
-
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
-  {
-    if ( len != static_cast<IdxT>(1) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+
+  template <typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
+  {
+    if (len != static_cast<IdxT>(1)) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
   }
 };
 
 // specialization for direct unchecked thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(len));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len != static_cast<IdxT>(IndexMapper::block_size) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    if (len != static_cast<IdxT>(IndexMapper::block_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for direct unchecked block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -285,36 +285,45 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len != static_cast<IdxT>(IndexMapper::grid_size) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    if (len != static_cast<IdxT>(IndexMapper::grid_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for direct unchecked global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    hip::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
     if (len != static_cast<IdxT>(0)) {
       RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
@@ -322,141 +331,179 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    const IdxT block_size = RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size));
-    if ( len != (block_size * static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    const IdxT block_size =
+        RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size));
+    if (len != (block_size * static_cast<IdxT>(IndexMapper::grid_size))) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_hip_dim<dim>(dims.threads, block_size);
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
     set_hip_dim<dim>(min_dims.threads, block_size);
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    const IdxT grid_size = RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size));
-    if ( len != (static_cast<IdxT>(IndexMapper::block_size) * grid_size) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    const IdxT grid_size =
+        RAJA_DIVIDE_CEILING_INT(len,
+                                static_cast<IdxT>(IndexMapper::block_size));
+    if (len != (static_cast<IdxT>(IndexMapper::block_size) * grid_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(dims.blocks, grid_size);
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(min_dims.blocks, grid_size);
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::DirectUnchecked,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len != (static_cast<IdxT>(IndexMapper::block_size) *
-                 static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len does not match the size of the direct_unchecked mapped index space");
+    if (len != (static_cast<IdxT>(IndexMapper::block_size) *
+                static_cast<IdxT>(IndexMapper::grid_size))) {
+      RAJA_ABORT_OR_THROW(
+          "len does not match the size of the direct_unchecked mapped index "
+          "space");
     }
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 
 // specialization for direct sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
-
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
-  {
-    if ( len > static_cast<IdxT>(1) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+
+  template <typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
+  {
+    if (len > static_cast<IdxT>(1)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
   }
 };
 
 // specialization for direct thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(len));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::block_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::block_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for direct block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -464,36 +511,44 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::grid_size)) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for direct global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
     if (len > static_cast<IdxT>(0)) {
       RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
@@ -501,127 +556,165 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_hip_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_hip_dim<dim>(dims.threads,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::grid_size)));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_hip_dim<dim>(dims.blocks,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::block_size)));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len > (static_cast<IdxT>(IndexMapper::block_size) *
-                static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (static_cast<IdxT>(IndexMapper::block_size) *
+               static_cast<IdxT>(IndexMapper::grid_size))) {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 
 // specialization for strided loop sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT RAJA_UNUSED_ARG(len))
   {
   }
 };
 
 // specialization for strided loop thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void set_dimensions(HipDims& dims,
+                             HipDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for strided loop block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -629,32 +722,39 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void set_dimensions(HipDims& dims,
+                             HipDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for strided loop global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
-{
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>> {
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     if (len > static_cast<IdxT>(0)) {
@@ -666,62 +766,86 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
-{
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>> {
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_hip_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_hip_dim<dim>(dims.threads,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-
-  using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+
+  using IndexMapper =
+      hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(dims.blocks,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(1));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
-{
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>> {
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void set_dimensions(HipDims& dims,
+                             HipDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp
index f5b4eda529..544857f793 100644
--- a/include/RAJA/policy/hip/launch.hpp
+++ b/include/RAJA/policy/hip/launch.hpp
@@ -18,10 +18,10 @@
 #ifndef RAJA_pattern_launch_hip_HPP
 #define RAJA_pattern_launch_hip_HPP
 
-#include "RAJA/pattern/launch/launch_core.hpp"
 #include "RAJA/pattern/detail/privatizer.hpp"
-#include "RAJA/policy/hip/policy.hpp"
+#include "RAJA/pattern/launch/launch_core.hpp"
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
+#include "RAJA/policy/hip/policy.hpp"
 #include "RAJA/policy/hip/raja_hiperrchk.hpp"
 #include "RAJA/util/resource.hpp"
 
@@ -35,9 +35,9 @@ __global__ void launch_global_fcn(BODY body_in)
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto &body = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
@@ -45,38 +45,44 @@ __global__ void launch_global_fcn(BODY body_in)
 }
 
 template <typename BODY, typename ReduceParams>
-__global__ void launch_new_reduce_global_fcn(BODY body_in, ReduceParams reduce_params)
+__global__ void launch_new_reduce_global_fcn(BODY body_in,
+                                             ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto &body = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
 template <bool async>
-struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>> {
+struct LaunchExecute<
+    RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>> {
 
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_global_fcn<BODY>);
+    auto func = reinterpret_cast<const void *>(&launch_global_fcn<BODY>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
 
@@ -84,18 +90,18 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(params.teams.value[0]),
-                        static_cast<hip_dim_member_t>(params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(params.teams.value[2]) };
+    hip_dim_t gridSize{static_cast<hip_dim_member_t>(params.teams.value[0]),
+                       static_cast<hip_dim_member_t>(params.teams.value[1]),
+                       static_cast<hip_dim_member_t>(params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(params.threads.value[2]) };
+    hip_dim_t blockSize{static_cast<hip_dim_member_t>(params.threads.value[0]),
+                        static_cast<hip_dim_member_t>(params.threads.value[1]),
+                        static_cast<hip_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) {
 
       RAJA_FT_BEGIN;
 
@@ -106,13 +112,24 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
         BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+                                                gridSize,
+                                                blockSize,
+                                                shared_mem_size,
+                                                hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void *args[] = {(void *)&body};
+        RAJA::hip::launch(func,
+                          gridSize,
+                          blockSize,
+                          args,
+                          shared_mem_size,
+                          hip_res,
+                          async,
+                          kernel_name);
       }
 
       RAJA_FT_END;
@@ -121,17 +138,22 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  //Version with explicit reduction parameters..
+  // Version with explicit reduction parameters.
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &launch_params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
+    auto func = reinterpret_cast<const void *>(
         &launch_new_reduce_global_fcn<BODY, camp::decay<ReduceParams>>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
@@ -140,18 +162,20 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[2]) };
+    hip_dim_t gridSize{
+        static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[2]) };
+    hip_dim_t blockSize{
+        static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) {
 
       RAJA_FT_BEGIN;
 
@@ -163,22 +187,36 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
       launch_info.res = hip_res;
 
       {
-        using EXEC_POL = RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL =
+            RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
         BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+                                                gridSize,
+                                                blockSize,
+                                                shared_mem_size,
+                                                hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
-
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        void *args[] = {(void *)&body, (void *)&launch_reducers};
+        RAJA::hip::launch(func,
+                          gridSize,
+                          blockSize,
+                          args,
+                          shared_mem_size,
+                          hip_res,
+                          async,
+                          kernel_name);
+
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -186,21 +224,20 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
 template <typename BODY, int num_threads>
 __launch_bounds__(num_threads, 1) __global__
-void launch_global_fcn_fixed(BODY body_in)
+    void launch_global_fcn_fixed(BODY body_in)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto &body = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
@@ -209,37 +246,43 @@ void launch_global_fcn_fixed(BODY body_in)
 
 template <typename BODY, int num_threads, typename ReduceParams>
 __launch_bounds__(num_threads, 1) __global__
-void launch_new_reduce_global_fcn_fixed(BODY body_in, ReduceParams reduce_params)
+    void launch_new_reduce_global_fcn_fixed(BODY body_in,
+                                            ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto &body = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(reduce_params);
+  // Using a flattened global policy, as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
 template <bool async, int nthreads>
 struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
 
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
+    auto func = reinterpret_cast<const void *>(
         &launch_global_fcn_fixed<BODY, nthreads>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
@@ -248,18 +291,18 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(params.teams.value[0]),
-                        static_cast<hip_dim_member_t>(params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(params.teams.value[2]) };
+    hip_dim_t gridSize{static_cast<hip_dim_member_t>(params.teams.value[0]),
+                       static_cast<hip_dim_member_t>(params.teams.value[1]),
+                       static_cast<hip_dim_member_t>(params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(params.threads.value[2]) };
+    hip_dim_t blockSize{static_cast<hip_dim_member_t>(params.threads.value[0]),
+                        static_cast<hip_dim_member_t>(params.threads.value[1]),
+                        static_cast<hip_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) {
 
       RAJA_FT_BEGIN;
 
@@ -270,13 +313,24 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
         BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+                                                gridSize,
+                                                blockSize,
+                                                shared_mem_size,
+                                                hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void *args[] = {(void *)&body};
+        RAJA::hip::launch(func,
+                          gridSize,
+                          blockSize,
+                          args,
+                          shared_mem_size,
+                          hip_res,
+                          async,
+                          kernel_name);
       }
 
       RAJA_FT_END;
@@ -285,18 +339,25 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  //Version with explicit reduction parameters..
+  // Version with explicit reduction parameters.
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &launch_params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_new_reduce_global_fcn_fixed<BODY, nthreads, camp::decay<ReduceParams>>);
+    auto func = reinterpret_cast<const void *>(
+        &launch_new_reduce_global_fcn_fixed<BODY,
+                                            nthreads,
+                                            camp::decay<ReduceParams>>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
 
@@ -304,18 +365,20 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[2]) };
+    hip_dim_t gridSize{
+        static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[2]) };
+    hip_dim_t blockSize{
+        static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) {
 
       RAJA_FT_BEGIN;
 
@@ -327,22 +390,36 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
       launch_info.res = hip_res;
 
       {
-        using EXEC_POL = RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL =
+            RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
         BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+                                                gridSize,
+                                                blockSize,
+                                                shared_mem_size,
+                                                hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
-
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        void *args[] = {(void *)&body, (void *)&launch_reducers};
+        RAJA::hip::launch(func,
+                          gridSize,
+                          blockSize,
+                          args,
+                          shared_mem_size,
+                          hip_res,
+                          async,
+                          kernel_name);
+
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -350,7 +427,6 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
@@ -358,18 +434,20 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
    HIP generic loop implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
   {
     const diff_t i = IndexMapper::template index<diff_t>();
 
@@ -378,13 +456,15 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -400,15 +480,20 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -429,18 +514,20 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
   {
     const diff_t len = segment.end() - segment.begin();
     const diff_t i = IndexMapper::template index<diff_t>();
@@ -452,13 +539,15 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -479,15 +568,20 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -514,18 +608,21 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
   {
     const diff_t len = segment.end() - segment.begin();
     const diff_t i_init = IndexMapper::template index<diff_t>();
@@ -538,13 +635,16 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -566,22 +666,27 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
 
       for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1));
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1));
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -623,12 +728,14 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
    HIP generic loop_icount implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -642,13 +749,15 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
   }
 };
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -660,21 +769,24 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    body(*(segment0.begin() + i0),
-         *(segment1.begin() + i1),
-         i0, i1);
+    body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -691,17 +803,21 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     body(*(segment0.begin() + i0),
          *(segment1.begin() + i1),
          *(segment2.begin() + i2),
-         i0, i1, i2);
+         i0,
+         i1,
+         i2);
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -718,13 +834,15 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
   }
 };
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -740,22 +858,25 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
     if (i0 < len0 && i1 < len1) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           i0, i1);
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -777,18 +898,23 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
       body(*(segment0.begin() + i0),
            *(segment1.begin() + i1),
            *(segment2.begin() + i2),
-           i0, i1, i2);
+           i0,
+           i1,
+           i2);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -807,13 +933,16 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -835,23 +964,27 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
 
       for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1),
-             i0, i1);
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -882,7 +1015,9 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
           body(*(segment0.begin() + i0),
                *(segment1.begin() + i1),
                *(segment2.begin() + i2),
-               i0, i1, i2);
+               i0,
+               i1,
+               i2);
         }
       }
     }
@@ -893,27 +1028,30 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
 /*
    HIP generic flattened loop implementations
 */
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  sync,
-                                                  IndexMapper0>,
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<
+                       RAJA::iteration_mapping::DirectUnchecked,
+                       sync,
+                       IndexMapper0>,
                    SEGMENT>
-{};
+    : LoopExecute<RAJA::policy::hip::hip_indexer<
+                      RAJA::iteration_mapping::DirectUnchecked,
+                      sync,
+                      IndexMapper0>,
+                  SEGMENT> {
+};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<
+                       RAJA::iteration_mapping::DirectUnchecked,
+                       kernel_sync_requirement::none,
+                       IndexMapper0,
+                       IndexMapper1>,
+                   SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -924,23 +1062,27 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
 
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
 
-    const int i = i0 + i0_stride*i1;
+    const int i = i0 + i0_stride * i1;
 
     body(*(segment.begin() + i));
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<
+                       RAJA::iteration_mapping::DirectUnchecked,
+                       kernel_sync_requirement::none,
+                       IndexMapper0,
+                       IndexMapper1,
+                       IndexMapper2>,
+                   SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -953,33 +1095,35 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    const int i = i0 + i0_stride*(i1 + i1_stride*i2);
+    const int i = i0 + i0_stride * (i1 + i1_stride * i2);
 
     body(*(segment.begin() + i));
   }
 };
 
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
-{};
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                           sync,
+                                           IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::hip::
+              hip_indexer<RAJA::iteration_mapping::Direct, sync, IndexMapper0>,
+          SEGMENT> {
+};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                           kernel_sync_requirement::none,
+                                           IndexMapper0,
+                                           IndexMapper1>,
+    SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -992,7 +1136,7 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
 
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
 
-    const int i = i0 + i0_stride*i1;
+    const int i = i0 + i0_stride * i1;
 
     if (i < len) {
       body(*(segment.begin() + i));
@@ -1000,17 +1144,21 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                           kernel_sync_requirement::none,
+                                           IndexMapper0,
+                                           IndexMapper1,
+                                           IndexMapper2>,
+    SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -1025,7 +1173,7 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    const int i = i0 + i0_stride*(i1 + i1_stride*i2);
+    const int i = i0 + i0_stride * (i1 + i1_stride * i2);
 
     if (i < len) {
       body(*(segment.begin() + i));
@@ -1033,27 +1181,33 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
   }
 };
 
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
-{};
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        sync,
+        IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::hip::hip_indexer<
+              RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+              sync,
+              IndexMapper0>,
+          SEGMENT> {
+};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -1067,25 +1221,28 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const int i0_stride = IndexMapper0::template size<diff_t>();
     const int i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*i1;
-         i < len;
-         i += i0_stride*i1_stride) {
+    for (int i = i0 + i0_stride * i1; i < len; i += i0_stride * i1_stride) {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
-{
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment,
@@ -1101,9 +1258,8 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const int i1_stride = IndexMapper1::template size<diff_t>();
     const int i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*(i1 + i1_stride*i2);
-         i < len;
-         i += i0_stride*i1_stride*i2_stride) {
+    for (int i = i0 + i0_stride * (i1 + i1_stride * i2); i < len;
+         i += i0_stride * i1_stride * i2_stride) {
       body(*(segment.begin() + i));
     }
   }
@@ -1114,12 +1270,14 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
    HIP generic tile implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1128,20 +1286,23 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
       SEGMENT const &segment,
       BODY const &body)
   {
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
 
     body(segment.slice(i, static_cast<diff_t>(tile_size)));
   }
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1152,23 +1313,30 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
       SEGMENT const &segment1,
       BODY const &body)
   {
-    const diff_t i0 = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1 = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i0 = IndexMapper0::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size0);
+    const diff_t i1 = IndexMapper1::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size1);
 
     body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
          segment1.slice(i1, static_cast<diff_t>(tile_size1)));
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1181,9 +1349,12 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
       SEGMENT const &segment2,
       BODY const &body)
   {
-    const diff_t i0 = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1 = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
-    const diff_t i2 = IndexMapper2::template index<diff_t>() * static_cast<diff_t>(tile_size2);
+    const diff_t i0 = IndexMapper0::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size0);
+    const diff_t i1 = IndexMapper1::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size1);
+    const diff_t i2 = IndexMapper2::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size2);
 
     body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
          segment1.slice(i1, static_cast<diff_t>(tile_size1)),
@@ -1192,12 +1363,14 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1207,7 +1380,8 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
       BODY const &body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
 
     if (i < len) {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
@@ -1216,13 +1390,15 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1236,8 +1412,10 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
     const diff_t len0 = segment0.end() - segment0.begin();
     const diff_t len1 = segment1.end() - segment1.begin();
 
-    const diff_t i0 = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1 = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i0 = IndexMapper0::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size0);
+    const diff_t i1 = IndexMapper1::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size1);
 
     if (i0 < len0 && i1 < len1) {
       body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
@@ -1246,15 +1424,20 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1271,9 +1454,12 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
     const diff_t len1 = segment1.end() - segment1.begin();
     const diff_t len2 = segment2.end() - segment2.begin();
 
-    const diff_t i0 = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1 = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
-    const diff_t i2 = IndexMapper2::template index<diff_t>() * static_cast<diff_t>(tile_size2);
+    const diff_t i0 = IndexMapper0::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size0);
+    const diff_t i1 = IndexMapper1::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size1);
+    const diff_t i2 = IndexMapper2::template index<diff_t>() *
+                      static_cast<diff_t>(tile_size2);
 
     if (i0 < len0 && i1 < len1 && i2 < len2) {
       body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
@@ -1284,12 +1470,15 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1299,8 +1488,10 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
       BODY const &body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
 
     for (diff_t i = i_init; i < len; i += i_stride) {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
@@ -1309,13 +1500,16 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1329,11 +1523,15 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
     const diff_t len0 = segment0.end() - segment0.begin();
     const diff_t len1 = segment1.end() - segment1.begin();
 
-    const diff_t i0_init = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1_init = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i0_init = IndexMapper0::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size0);
+    const diff_t i1_init = IndexMapper1::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size1);
 
-    const diff_t i0_stride = IndexMapper0::template size<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1_stride = IndexMapper1::template size<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i0_stride =
+        IndexMapper0::template size<diff_t>() * static_cast<diff_t>(tile_size0);
+    const diff_t i1_stride =
+        IndexMapper1::template size<diff_t>() * static_cast<diff_t>(tile_size1);
 
     for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
       for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
@@ -1344,15 +1542,21 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1369,13 +1573,19 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
     const diff_t len1 = segment1.end() - segment1.begin();
     const diff_t len2 = segment2.end() - segment2.begin();
 
-    const diff_t i0_init = IndexMapper0::template index<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1_init = IndexMapper1::template index<diff_t>() * static_cast<diff_t>(tile_size1);
-    const diff_t i2_init = IndexMapper2::template index<diff_t>() * static_cast<diff_t>(tile_size2);
+    const diff_t i0_init = IndexMapper0::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size0);
+    const diff_t i1_init = IndexMapper1::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size1);
+    const diff_t i2_init = IndexMapper2::template index<diff_t>() *
+                           static_cast<diff_t>(tile_size2);
 
-    const diff_t i0_stride = IndexMapper0::template size<diff_t>() * static_cast<diff_t>(tile_size0);
-    const diff_t i1_stride = IndexMapper1::template size<diff_t>() * static_cast<diff_t>(tile_size1);
-    const diff_t i2_stride = IndexMapper2::template size<diff_t>() * static_cast<diff_t>(tile_size2);
+    const diff_t i0_stride =
+        IndexMapper0::template size<diff_t>() * static_cast<diff_t>(tile_size0);
+    const diff_t i1_stride =
+        IndexMapper1::template size<diff_t>() * static_cast<diff_t>(tile_size1);
+    const diff_t i2_stride =
+        IndexMapper2::template size<diff_t>() * static_cast<diff_t>(tile_size2);
 
     for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
       for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
@@ -1394,12 +1604,14 @@ struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
    HIP generic tile_tcount implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1416,13 +1628,15 @@ struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1441,19 +1655,25 @@ struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
 
     body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
          segment1.slice(i1, static_cast<diff_t>(tile_size1)),
-         t0, t1);
+         t0,
+         t1);
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::DirectUnchecked,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1477,17 +1697,21 @@ struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
          segment1.slice(i1, static_cast<diff_t>(tile_size1)),
          segment2.slice(i2, static_cast<diff_t>(tile_size2)),
-         t0, t1, t2);
+         t0,
+         t1,
+         t2);
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1507,13 +1731,15 @@ struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1536,20 +1762,26 @@ struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     if (i0 < len0 && i1 < len1) {
       body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
            segment1.slice(i1, static_cast<diff_t>(tile_size1)),
-           t0, t1);
+           t0,
+           t1);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1578,18 +1810,23 @@ struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
       body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
            segment1.slice(i1, static_cast<diff_t>(tile_size1)),
            segment2.slice(i2, static_cast<diff_t>(tile_size2)),
-           t0, t1, t2);
+           t0,
+           t1,
+           t2);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1611,13 +1848,16 @@ struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1643,25 +1883,34 @@ struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i0_stride = t0_stride * static_cast<diff_t>(tile_size0);
     const diff_t i1_stride = t1_stride * static_cast<diff_t>(tile_size1);
 
-    for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) {
-      for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) {
+    for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0;
+         i0 += i0_stride, t0 += t0_stride) {
+      for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1;
+           i1 += i1_stride, t1 += t1_stride) {
         body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
              segment1.slice(i1, static_cast<diff_t>(tile_size1)),
-             t0, t1);
+             t0,
+             t1);
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT> {
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
@@ -1694,13 +1943,18 @@ struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i1_stride = t1_stride * static_cast<diff_t>(tile_size1);
     const diff_t i2_stride = t2_stride * static_cast<diff_t>(tile_size2);
 
-    for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0; i0 += i0_stride, t0 += t0_stride) {
-      for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1; i1 += i1_stride, t1 += t1_stride) {
-        for (diff_t i2 = i2_init, t2 = t2_init; i2 < len2; i2 += i2_stride, t2 += t2_stride) {
+    for (diff_t i0 = i0_init, t0 = t0_init; i0 < len0;
+         i0 += i0_stride, t0 += t0_stride) {
+      for (diff_t i1 = i1_init, t1 = t1_init; i1 < len1;
+           i1 += i1_stride, t1 += t1_stride) {
+        for (diff_t i2 = i2_init, t2 = t2_init; i2 < len2;
+             i2 += i2_stride, t2 += t2_stride) {
           body(segment0.slice(i0, static_cast<diff_t>(tile_size0)),
                segment1.slice(i1, static_cast<diff_t>(tile_size1)),
                segment2.slice(i2, static_cast<diff_t>(tile_size2)),
-               t0, t1, t2);
+               t0,
+               t1,
+               t2);
         }
       }
     }
diff --git a/include/RAJA/policy/hip/multi_reduce.hpp b/include/RAJA/policy/hip/multi_reduce.hpp
index 0d9d3899d8..67bd53cddc 100644
--- a/include/RAJA/policy/hip/multi_reduce.hpp
+++ b/include/RAJA/policy/hip/multi_reduce.hpp
@@ -25,30 +25,27 @@
 
 #if defined(RAJA_ENABLE_HIP)
 
-#include <type_traits>
 #include <limits>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
-#include "hip/hip_runtime.h"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/math.hpp"
-#include "RAJA/util/mutex.hpp"
-#include "RAJA/util/types.hpp"
-#include "RAJA/util/reduce.hpp"
-#include "RAJA/util/OffsetOperators.hpp"
-
 #include "RAJA/pattern/detail/multi_reduce.hpp"
 #include "RAJA/pattern/multi_reduce.hpp"
-
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #include "RAJA/policy/hip/intrinsics.hpp"
+#include "RAJA/util/OffsetOperators.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/math.hpp"
+#include "RAJA/util/mutex.hpp"
+#include "RAJA/util/reduce.hpp"
+#include "RAJA/util/types.hpp"
+#include "hip/hip_runtime.h"
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-  #include "RAJA/policy/desul/atomic.hpp"
+#include "RAJA/policy/desul/atomic.hpp"
 #else
-  #include "RAJA/policy/hip/atomic.hpp"
+#include "RAJA/policy/hip/atomic.hpp"
 #endif
 
 #include "RAJA/policy/hip/policy.hpp"
@@ -73,32 +70,40 @@ namespace impl
 //
 
 //! combine value into global memory
-template <typename Combiner, typename GetTallyIndex,
-          typename T, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins),
-                                                                      T identity,
-                                                                      int bin,
-                                                                      T value,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+template <typename Combiner,
+          typename GetTallyIndex,
+          typename T,
+          typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(
+    int RAJA_UNUSED_ARG(num_bins),
+    T identity,
+    int bin,
+    T value,
+    T* tally_mem,
+    GetTallyOffset get_tally_offset,
+    int tally_replication,
+    int tally_bins)
 {
-  if (value == identity) { return; }
+  if (value == identity) {
+    return;
+  }
 
-  int tally_index = GetTallyIndex::template index<int>(); // globalWarpId by default
+  int tally_index =
+      GetTallyIndex::template index<int>();  // globalWarpId by default
   int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication);
-  int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+  int tally_offset =
+      get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
   RAJA::reduce::hip::atomic<Combiner>{}(tally_mem[tally_offset], value);
 }
 
 
 //! initialize shared memory
 template <typename T>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins,
-                                                           T identity,
-                                                           T* shared_mem,
-                                                           int shared_replication)
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(
+    int num_bins,
+    T identity,
+    T* shared_mem,
+    int shared_replication)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -113,60 +118,71 @@ RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins,
 }
 
 //! combine value into shared memory
-template <typename Combiner, typename GetSharedIndex,
-          typename T, typename GetSharedOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins,
-                                                                     T identity,
-                                                                     int bin,
-                                                                     T value,
-                                                                     T* shared_mem,
-                                                                     GetSharedOffset get_shared_offset,
-                                                                     int shared_replication)
+template <typename Combiner,
+          typename GetSharedIndex,
+          typename T,
+          typename GetSharedOffset>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(
+    int num_bins,
+    T identity,
+    int bin,
+    T value,
+    T* shared_mem,
+    GetSharedOffset get_shared_offset,
+    int shared_replication)
 {
-  if (value == identity) { return; }
+  if (value == identity) {
+    return;
+  }
 
-  int shared_index = GetSharedIndex::template index<int>(); // threadId by default
+  int shared_index =
+      GetSharedIndex::template index<int>();  // threadId by default
   int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication);
-  int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+  int shmem_offset =
+      get_shared_offset(bin, num_bins, shared_rep, shared_replication);
 
   RAJA::reduce::hip::atomic<Combiner>{}(shared_mem[shmem_offset], value);
 }
 
 //! combine value into shared memory
 template <typename Combiner,
-          typename T, typename GetSharedOffset, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins,
-                                                                      T identity,
-                                                                      T* shared_mem,
-                                                                      GetSharedOffset get_shared_offset,
-                                                                      int shared_replication,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+          typename T,
+          typename GetSharedOffset,
+          typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(
+    int num_bins,
+    T identity,
+    T* shared_mem,
+    GetSharedOffset get_shared_offset,
+    int shared_replication,
+    T* tally_mem,
+    GetTallyOffset get_tally_offset,
+    int tally_replication,
+    int tally_bins)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
-                 (gridDim.x * gridDim.y) * blockIdx.z;
+                (gridDim.x * gridDim.y) * blockIdx.z;
 
   __syncthreads();
   for (int bin = threadId; bin < num_bins; bin += numThreads) {
 
     T value = identity;
     for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) {
-      int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+      int shmem_offset =
+          get_shared_offset(bin, num_bins, shared_rep, shared_replication);
       Combiner{}(value, shared_mem[shmem_offset]);
     }
 
     if (value != identity) {
       int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication);
-      int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+      int tally_offset =
+          get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
       RAJA::reduce::hip::atomic<Combiner>{}(tally_mem[tally_offset], value);
     }
-
   }
 }
 
@@ -182,30 +198,35 @@ RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bi
 
 //! MultiReduction data for Hip Offload -- stores value, host pointer
 template <typename Combiner, typename T, typename tuning>
-struct MultiReduceGridAtomicHostInit_TallyData
-{
+struct MultiReduceGridAtomicHostInit_TallyData {
   //! setup permanent settings, allocate and initialize tally memory
-  template < typename Container >
-  MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity)
-      : m_tally_mem(nullptr)
-      , m_identity(identity)
-      , m_num_bins(container.size())
-      , m_tally_bins(get_tally_bins(m_num_bins))
-      , m_tally_replication(get_tally_replication())
+  template <typename Container>
+  MultiReduceGridAtomicHostInit_TallyData(Container const& container,
+                                          T const& identity)
+      : m_tally_mem(nullptr),
+        m_identity(identity),
+        m_num_bins(container.size()),
+        m_tally_bins(get_tally_bins(m_num_bins)),
+        m_tally_replication(get_tally_replication())
   {
-    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
+    m_tally_mem = create_tally(
+        container, identity, m_num_bins, m_tally_bins, m_tally_replication);
   }
 
   MultiReduceGridAtomicHostInit_TallyData() = delete;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  MultiReduceGridAtomicHostInit_TallyData& operator=(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData& operator=(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
   ~MultiReduceGridAtomicHostInit_TallyData() = default;
 
 
   //! reset permanent settings, reallocate and reset tally memory
-  template < typename Container >
+  template <typename Container>
   void reset_permanent(Container const& container, T const& identity)
   {
     int new_num_bins = container.size();
@@ -214,19 +235,22 @@ struct MultiReduceGridAtomicHostInit_TallyData
       m_num_bins = new_num_bins;
       m_tally_bins = get_tally_bins(m_num_bins);
       m_tally_replication = get_tally_replication();
-      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
+      m_tally_mem = create_tally(
+          container, identity, m_num_bins, m_tally_bins, m_tally_replication);
     } else {
       {
         int tally_rep = 0;
         int bin = 0;
         for (auto const& value : container) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value;
+          m_tally_mem[GetTallyOffset{}(
+              bin, m_tally_bins, tally_rep, m_tally_replication)] = value;
           ++bin;
         }
       }
       for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) {
         for (int bin = 0; bin < m_num_bins; ++bin) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity;
+          m_tally_mem[GetTallyOffset{}(
+              bin, m_tally_bins, tally_rep, m_tally_replication)] = identity;
         }
       }
     }
@@ -244,9 +268,10 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T get(int bin) const
   {
     ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
-          reducer(m_identity);
+        reducer(m_identity);
     for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) {
-      int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+      int tally_offset =
+          GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
       reducer.combine(m_tally_mem[tally_offset]);
     }
     return reducer.get_and_clear();
@@ -258,20 +283,27 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T identity() const { return m_identity; }
 
 private:
-  static constexpr size_t s_tally_alignment = std::max(size_t(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
-                                                       size_t(RAJA::DATA_ALIGN));
-  static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
+  static constexpr size_t s_tally_alignment = std::max(
+      size_t(
+          policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
+      size_t(RAJA::DATA_ALIGN));
+  static constexpr size_t s_tally_bunch_size =
+      RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
 
   using tally_mempool_type = device_pinned_mempool_type;
   using tally_tuning = typename tuning::GlobalAtomicReplicationTuning;
-  using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer;
+  using TallyAtomicReplicationConcretizer =
+      typename tally_tuning::AtomicReplicationConcretizer;
   using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator;
-  using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch<s_tally_bunch_size>;
+  using GetTallyOffset_rebind =
+      typename GetTallyOffset_rebind_rebunch::template rebunch<
+          s_tally_bunch_size>;
 
 
   static int get_tally_bins(int num_bins)
   {
-    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size;
+    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) *
+           s_tally_bunch_size;
   }
 
   static int get_tally_replication()
@@ -285,35 +317,40 @@ struct MultiReduceGridAtomicHostInit_TallyData
       int func_min_global_replication;
     } func_data{min_tally_replication};
 
-    return TallyAtomicReplicationConcretizer{}.template
-        get_global_replication<int>(func_data);
+    return TallyAtomicReplicationConcretizer{}
+        .template get_global_replication<int>(func_data);
   }
 
-  template < typename Container >
-  static T* create_tally(Container const& container, T const& identity,
-                         int num_bins, int tally_bins, int tally_replication)
+  template <typename Container>
+  static T* create_tally(Container const& container,
+                         T const& identity,
+                         int num_bins,
+                         int tally_bins,
+                         int tally_replication)
   {
     if (num_bins == size_t(0)) {
       return nullptr;
     }
 
     T* tally_mem = tally_mempool_type::getInstance().template malloc<T>(
-        tally_replication*tally_bins, s_tally_alignment);
+        tally_replication * tally_bins, s_tally_alignment);
 
     if (tally_replication > 0) {
       {
         int tally_rep = 0;
         int bin = 0;
         for (auto const& value : container) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(value);
+          int tally_offset =
+              GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(value);
           ++bin;
         }
       }
       for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) {
         for (int bin = 0; bin < num_bins; ++bin) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(identity);
+          int tally_offset =
+              GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(identity);
         }
       }
     }
@@ -321,15 +358,20 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
   static void destroy_tally(T*& tally_mem,
-                            int num_bins, int tally_bins, int tally_replication)
+                            int num_bins,
+                            int tally_bins,
+                            int tally_replication)
   {
     if (num_bins == size_t(0)) {
       return;
     }
 
-    for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) {
+    for (int tally_rep = tally_replication + 1; tally_rep > 0; --tally_rep) {
       for (int bin = num_bins; bin > 0; --bin) {
-        int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication);
+        int tally_offset = GetTallyOffset{}(bin - 1,
+                                            tally_bins,
+                                            tally_rep - 1,
+                                            tally_replication);
         tally_mem[tally_offset].~T();
       }
     }
@@ -345,43 +387,40 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T m_identity;
   int m_num_bins;
   int m_tally_bins;
-  int m_tally_replication; // power of 2, at least the max number of omp threads
+  int m_tally_replication;  // power of 2, at least the max number of omp
+                            // threads
 };
 
 
 //! MultiReduction data for Hip Offload -- stores value, host pointer
 template <typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_Data
-    : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
-{
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+    : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning> {
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! defer to tally data for some functions
-  using TallyData::TallyData;
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::TallyData;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, do nothing
-  void setup_launch(size_t RAJA_UNUSED_ARG(block_size))
-  { }
+  void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) {}
 
   //! teardown per launch, do nothing
-  void teardown_launch()
-  { }
+  void teardown_launch() {}
 
 
   //! setup on device, do nothing
   RAJA_DEVICE
-  void setup_device()
-  { }
+  void setup_device() {}
 
   //! finalize on device, do nothing
   RAJA_DEVICE
-  void finalize_device()
-  { }
+  void finalize_device() {}
 
 
   //! combine value on device, combine a value into the tally atomically
@@ -389,9 +428,14 @@ struct MultiReduceGridAtomicHostInit_Data
   void combine_device(int bin, T value)
   {
     impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-        m_num_bins, m_identity,
-        bin, value,
-        m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+        m_num_bins,
+        m_identity,
+        bin,
+        value,
+        m_tally_mem,
+        GetTallyOffset{},
+        m_tally_replication,
+        m_tally_bins);
   }
 
   //! combine value on host, combine a value into the tally
@@ -401,7 +445,8 @@ struct MultiReduceGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    int tally_offset =
+        GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
     Combiner{}(m_tally_mem[tally_offset], value);
   }
 
@@ -409,10 +454,10 @@ struct MultiReduceGridAtomicHostInit_Data
   using typename TallyData::GetTallyIndex;
   using typename TallyData::GetTallyOffset;
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 };
 
@@ -420,32 +465,38 @@ struct MultiReduceGridAtomicHostInit_Data
 //! MultiReduction data for Hip Offload -- stores value, host pointer
 template <typename Combiner, typename T, typename tuning>
 struct MultiReduceBlockThenGridAtomicHostInit_Data
-    : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
-{
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+    : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning> {
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! setup permanent settings, defer to tally data
-  template < typename Container >
-  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity)
-      : TallyData(container, identity)
-      , m_shared_offset(s_shared_offset_unknown)
-      , m_shared_replication(0)
-  { }
+  template <typename Container>
+  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container,
+                                              T const& identity)
+      : TallyData(container, identity),
+        m_shared_offset(s_shared_offset_unknown),
+        m_shared_replication(0)
+  {
+  }
 
   MultiReduceBlockThenGridAtomicHostInit_Data() = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
   ~MultiReduceBlockThenGridAtomicHostInit_Data() = default;
 
 
   //! defer to tally data for some functions
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, setup shared memory parameters
   void setup_launch(size_t block_size)
@@ -456,18 +507,18 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
     }
 
     size_t shared_replication = 0;
-    const size_t shared_offset = allocateDynamicShmem<T>(
-        [&](size_t max_shmem_size) {
-
-      struct {
-        size_t func_threads_per_block;
-        size_t func_max_shared_replication_per_block;
-      } func_data{block_size, max_shmem_size / m_num_bins};
-
-      shared_replication = SharedAtomicReplicationConcretizer{}.template
-          get_shared_replication<size_t>(func_data);
-      return m_num_bins * shared_replication;
-    });
+    const size_t shared_offset =
+        allocateDynamicShmem<T>([&](size_t max_shmem_size) {
+          struct {
+            size_t func_threads_per_block;
+            size_t func_max_shared_replication_per_block;
+          } func_data{block_size, max_shmem_size / m_num_bins};
+
+          shared_replication =
+              SharedAtomicReplicationConcretizer{}
+                  .template get_shared_replication<size_t>(func_data);
+          return m_num_bins * shared_replication;
+        });
 
     if (shared_offset != dynamic_smem_allocation_failure) {
       m_shared_replication = static_cast<int>(shared_replication);
@@ -491,9 +542,10 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   {
     T* shared_mem = get_shared_mem();
     if (shared_mem != nullptr) {
-      impl::block_multi_reduce_init_shmem(
-          m_num_bins, m_identity,
-          shared_mem, m_shared_replication);
+      impl::block_multi_reduce_init_shmem(m_num_bins,
+                                          m_identity,
+                                          shared_mem,
+                                          m_shared_replication);
     }
   }
 
@@ -504,9 +556,15 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
     T* shared_mem = get_shared_mem();
     if (shared_mem != nullptr) {
       impl::grid_multi_reduce_shmem_to_global_atomic<Combiner>(
-          m_num_bins, m_identity,
-          shared_mem, GetSharedOffset{}, m_shared_replication,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins,
+          m_identity,
+          shared_mem,
+          GetSharedOffset{},
+          m_shared_replication,
+          m_tally_mem,
+          GetTallyOffset{},
+          m_tally_replication,
+          m_tally_bins);
     }
   }
 
@@ -518,14 +576,23 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
     T* shared_mem = get_shared_mem();
     if (shared_mem != nullptr) {
       impl::block_multi_reduce_combine_shmem_atomic<Combiner, GetSharedIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          shared_mem, GetSharedOffset{}, m_shared_replication);
+          m_num_bins,
+          m_identity,
+          bin,
+          value,
+          shared_mem,
+          GetSharedOffset{},
+          m_shared_replication);
     } else {
       impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins,
+          m_identity,
+          bin,
+          value,
+          m_tally_mem,
+          GetTallyOffset{},
+          m_tally_replication,
+          m_tally_bins);
     }
   }
 
@@ -536,13 +603,15 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    int tally_offset =
+        GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
     Combiner{}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using shared_tuning = typename tuning::SharedAtomicReplicationTuning;
-  using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer;
+  using SharedAtomicReplicationConcretizer =
+      typename shared_tuning::AtomicReplicationConcretizer;
   using GetSharedIndex = typename shared_tuning::ReplicationIndexer;
   using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator;
   using GetSharedOffset = typename GetSharedOffset_rebind::template rebind<int>;
@@ -551,18 +620,20 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   using typename TallyData::GetTallyOffset;
 
 
-  static constexpr int s_shared_offset_unknown = std::numeric_limits<int>::max();
-  static constexpr int s_shared_offset_invalid = std::numeric_limits<int>::max() - 1;
+  static constexpr int s_shared_offset_unknown =
+      std::numeric_limits<int>::max();
+  static constexpr int s_shared_offset_invalid =
+      std::numeric_limits<int>::max() - 1;
 
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 
-  int m_shared_offset; // in bytes
-  int m_shared_replication; // power of 2
+  int m_shared_offset;       // in bytes
+  int m_shared_replication;  // power of 2
 
 
   RAJA_DEVICE
@@ -595,19 +666,28 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
-struct MultiReduceDataHip
-{
-  static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available<T>::value;
+template <typename T, typename t_MultiReduceOp, typename tuning>
+struct MultiReduceDataHip {
+  static constexpr bool atomic_available =
+      RAJA::reduce::hip::hip_atomic_available<T>::value;
 
   //! hip reduction data storage class and folding algorithm
-  using reduce_data_type =
-      std::conditional_t<(atomic_available),
-        std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic),
-          hip::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-          std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic),
-            hip::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-            void>>,
+  using reduce_data_type = std::conditional_t<
+      (atomic_available),
+      std::conditional_t<
+          (tuning::algorithm ==
+           multi_reduce_algorithm::
+               init_host_combine_block_atomic_then_grid_atomic),
+          hip::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                           T,
+                                                           tuning>,
+          std::conditional_t<
+              (tuning::algorithm ==
+               multi_reduce_algorithm::init_host_combine_global_atomic),
+              hip::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                      T,
+                                                      tuning>,
+              void>>,
       void>;
 
 
@@ -619,13 +699,14 @@ struct MultiReduceDataHip
 
   MultiReduceDataHip() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataHip>::value>* = nullptr >
+  template <typename Container,
+            std::enable_if_t<
+                !std::is_same<Container, MultiReduceDataHip>::value>* = nullptr>
   MultiReduceDataHip(Container const& container, T identity)
-      : m_parent(this)
-      , m_sync_list(new SyncList)
-      , m_data(container, identity)
-      , m_own_launch_data(false)
+      : m_parent(this),
+        m_sync_list(new SyncList),
+        m_data(container, identity),
+        m_own_launch_data(false)
   {
   }
 
@@ -639,9 +720,10 @@ struct MultiReduceDataHip
 #else
       : m_parent(&other)
 #endif
-      , m_sync_list(other.m_sync_list)
-      , m_data(other.m_data)
-      , m_own_launch_data(false)
+        ,
+        m_sync_list(other.m_sync_list),
+        m_data(other.m_data),
+        m_own_launch_data(false)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
     if (m_parent) {
@@ -661,9 +743,9 @@ struct MultiReduceDataHip
 #endif
   }
 
-  MultiReduceDataHip(MultiReduceDataHip &&) = delete;
+  MultiReduceDataHip(MultiReduceDataHip&&) = delete;
   MultiReduceDataHip& operator=(MultiReduceDataHip const&) = delete;
-  MultiReduceDataHip& operator=(MultiReduceDataHip &&) = delete;
+  MultiReduceDataHip& operator=(MultiReduceDataHip&&) = delete;
 
   //! cleanup resources owned by this copy
   //  on device store in pinned buffer on host
@@ -695,7 +777,7 @@ struct MultiReduceDataHip
   }
 
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
     synchronize_resources_and_clear_list();
@@ -729,7 +811,7 @@ struct MultiReduceDataHip
 
 
 private:
-  MultiReduceDataHip const *m_parent;
+  MultiReduceDataHip const* m_parent;
   SyncList* m_sync_list;
   reduce_data_type m_data;
   bool m_own_launch_data;
@@ -755,7 +837,8 @@ struct MultiReduceDataHip
 
 }  // end namespace hip
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::hip::hip_multi_reduce_policy, hip::MultiReduceDataHip)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::hip::hip_multi_reduce_policy,
+                                hip::MultiReduceDataHip)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/hip/params/kernel_name.hpp b/include/RAJA/policy/hip/params/kernel_name.hpp
index 30269f8406..4f5fd33023 100644
--- a/include/RAJA/policy/hip/params/kernel_name.hpp
+++ b/include/RAJA/policy/hip/params/kernel_name.hpp
@@ -3,50 +3,57 @@
 
 #if defined(RAJA_HIP_ACTIVE)
 
-#include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #include "RAJA/pattern/params/kernel_name.hpp"
+#include "RAJA/policy/hip/MemUtils_HIP.hpp"
 
 #if defined(RAJA_ENABLE_ROCTX)
 #include "hip/hip_runtime_api.h"
 #include "roctx.h"
 #endif
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  init(KernelName& kn, const RAJA::hip::detail::hipInfo &)
-  {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL> > init(
+    KernelName &kn,
+    const RAJA::hip::detail::hipInfo &)
+{
 #if defined(RAJA_ENABLE_ROCTX)
-    roctxRangePush(kn.name);
+  roctxRangePush(kn.name);
 #else
-    RAJA_UNUSED_VAR(kn);
+  RAJA_UNUSED_VAR(kn);
 #endif
-  }
-
-  // Combine
-  template<typename EXEC_POL>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  combine(KernelName&) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  resolve(KernelName&, const RAJA::hip::detail::hipInfo &)
-  {
+}
+
+// Combine
+template <typename EXEC_POL>
+RAJA_HOST_DEVICE camp::concepts::enable_if<
+    type_traits::is_hip_policy<EXEC_POL> >
+combine(KernelName &)
+{
+}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL> > resolve(
+    KernelName &,
+    const RAJA::hip::detail::hipInfo &)
+{
 #if defined(RAJA_ENABLE_ROCTX)
-    roctxRangePop();
+  roctxRangePop();
 #endif
-  }
+}
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_HIP_REDUCE_HPP
+#endif  //  NEW_REDUCE_HIP_REDUCE_HPP
diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp
index a3da07ee2c..209c71f1b7 100644
--- a/include/RAJA/policy/hip/params/reduce.hpp
+++ b/include/RAJA/policy/hip/params/reduce.hpp
@@ -4,58 +4,64 @@
 #if defined(RAJA_HIP_ACTIVE)
 
 #include <hip/hip_runtime.h>
+
+#include "RAJA/pattern/params/reducer.hpp"
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #include "RAJA/policy/hip/reduce.hpp"
-#include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red, RAJA::hip::detail::hipInfo& hi)
-  {
-    red.devicetarget = RAJA::hip::pinned_mempool_type::getInstance().template malloc<T>(1);
-    red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z);
-    red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance().template malloc<unsigned int>(1);
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& red)
-  {
-    RAJA::hip::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter,OP>( red.devicetarget,
-                                                                            red.getVal(),
-                                                                            red.device_mem,
-                                                                            red.device_count);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red, RAJA::hip::detail::hipInfo& hi)
-  {
-    // complete reduction
-    hi.res.wait();
-
-    red.combineTarget(*red.devicetarget);
-
-    // free memory
-    RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count);
-    red.device_count = nullptr;
-    red.device_mem.deallocate();
-    RAJA::hip::pinned_mempool_type::getInstance().free(red.devicetarget);
-    red.devicetarget = nullptr;
-  }
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL> > init(
+    Reducer<OP, T, VOp>& red,
+    RAJA::hip::detail::hipInfo& hi)
+{
+  red.devicetarget =
+      RAJA::hip::pinned_mempool_type::getInstance().template malloc<T>(1);
+  red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z);
+  red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance()
+                         .template malloc<unsigned int>(1);
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+RAJA_HOST_DEVICE camp::concepts::enable_if<
+    type_traits::is_hip_policy<EXEC_POL> >
+combine(Reducer<OP, T, VOp>& red)
+{
+  RAJA::hip::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter, OP>(
+      red.devicetarget, red.getVal(), red.device_mem, red.device_count);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL> > resolve(
+    Reducer<OP, T, VOp>& red,
+    RAJA::hip::detail::hipInfo& hi)
+{
+  // complete reduction
+  hi.res.wait();
+
+  red.combineTarget(*red.devicetarget);
+
+  // free memory
+  RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count);
+  red.device_count = nullptr;
+  red.device_mem.deallocate();
+  RAJA::hip::pinned_mempool_type::getInstance().free(red.devicetarget);
+  red.devicetarget = nullptr;
+}
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_HIP_REDUCE_HPP
+#endif  //  NEW_REDUCE_HIP_REDUCE_HPP
diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index 6d24ab5667..1d87b76c53 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -23,17 +23,15 @@
 #if defined(RAJA_HIP_ACTIVE)
 
 #include <utility>
-#include "hip/hip_runtime.h"
 
 #include "RAJA/pattern/reduce.hpp"
-
 #include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/policy/sequential/policy.hpp"
-
-#include "RAJA/util/Operators.hpp"
 #include "RAJA/util/OffsetOperators.hpp"
-#include "RAJA/util/types.hpp"
+#include "RAJA/util/Operators.hpp"
 #include "RAJA/util/math.hpp"
+#include "RAJA/util/types.hpp"
+#include "hip/hip_runtime.h"
 
 namespace RAJA
 {
@@ -70,16 +68,16 @@ namespace hip
 {
 
 /// Type representing thread and block indexing within a grid
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct IndexGlobal;
 
-template<typename ...indexers>
+template <typename... indexers>
 struct IndexFlatten;
 
-template<size_t divisor, typename index>
+template <size_t divisor, typename index>
 struct IndexDivide;
 
-template<size_t divisor, typename index>
+template <size_t divisor, typename index>
 struct IndexModulo;
 
 
@@ -89,15 +87,15 @@ struct IndexModulo;
  * Note that the maximum occupancy of the kernel may be less than the maximum
  * occupancy of the device in terms of total threads.
  */
-struct MaxOccupancyConcretizer
-{
-  template < typename IdxT, typename Data >
+struct MaxOccupancyConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     IdxT device_sm_per_device = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -110,10 +108,9 @@ struct MaxOccupancyConcretizer
  * maximum grid size:
  * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm
  */
-template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
-struct FractionOffsetOccupancyConcretizer
-{
-  template < typename IdxT, typename Data >
+template <typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
+struct FractionOffsetOccupancyConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     using Fraction = typename t_Fraction::template rebind<IdxT>;
@@ -125,11 +122,14 @@ struct FractionOffsetOccupancyConcretizer
       func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm);
     }
 
-    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) {
-      func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
+    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) >
+        IdxT(0)) {
+      func_max_blocks_per_sm =
+          IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
     }
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -143,22 +143,23 @@ struct FractionOffsetOccupancyConcretizer
  * Otherwise use the given AvoidMaxOccupancyCalculator to determine the
  * maximum grid size.
  */
-template < typename AvoidMaxOccupancyConcretizer >
-struct AvoidDeviceMaxThreadOccupancyConcretizer
-{
-  template < typename IdxT, typename Data >
+template <typename AvoidMaxOccupancyConcretizer>
+struct AvoidDeviceMaxThreadOccupancyConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     IdxT device_max_threads_per_sm = data.device_max_threads_per_sm;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
     IdxT func_threads_per_block = data.func_threads_per_block;
 
-    IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm;
+    IdxT func_max_threads_per_sm =
+        func_threads_per_block * func_max_blocks_per_sm;
 
     if (func_max_threads_per_sm < device_max_threads_per_sm) {
       return MaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
     } else {
-      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
+      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(
+          data);
     }
   }
 };
@@ -167,10 +168,9 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer
 /*!
  * Get an amount of replication that is preferred_replication.
  */
-template < size_t preferred_replication >
-struct ConstantPreferredReplicationConcretizer
-{
-  template < typename IdxT, typename Data >
+template <size_t preferred_replication>
+struct ConstantPreferredReplicationConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data))
   {
     return IdxT(preferred_replication);
@@ -182,11 +182,11 @@ struct ConstantPreferredReplicationConcretizer
  * data.func_threads_per_block is less than t_cutoff or
  * preferred_replication_after_cutoff otherwise.
  */
-template < size_t t_cutoff, size_t preferred_replication_before_cutoff,
-                            size_t preferred_replication_after_cutoff >
-struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
-{
-  template < typename IdxT, typename Data >
+template <size_t t_cutoff,
+          size_t preferred_replication_before_cutoff,
+          size_t preferred_replication_after_cutoff>
+struct ThreadsPerBlockCutoffPreferredReplicationConcretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& data)
   {
     IdxT cutoff = t_cutoff;
@@ -205,19 +205,20 @@ struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
  * most the amount given by data.func_max_shared_replication_per_block or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
-struct SharedAtomicReplicationMaxPow2Concretizer
-{
-  template < typename IdxT, typename Data >
+template <typename GetPreferredReplication>
+struct SharedAtomicReplicationMaxPow2Concretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_shared_replication(Data const& data)
   {
-    IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block;
+    IdxT func_max_shared_replication_per_block =
+        data.func_max_shared_replication_per_block;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication{}.template get_preferred_replication<IdxT>(
+            data);
 
-    return prev_pow2(std::min(preferred_replication,
-                              func_max_shared_replication_per_block));
+    return prev_pow2(
+        std::min(preferred_replication, func_max_shared_replication_per_block));
   }
 };
 
@@ -226,39 +227,36 @@ struct SharedAtomicReplicationMaxPow2Concretizer
  * least the amount given by data.func_min_global_replication or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
-struct GlobalAtomicReplicationMinPow2Concretizer
-{
-  template < typename IdxT, typename Data >
+template <typename GetPreferredReplication>
+struct GlobalAtomicReplicationMinPow2Concretizer {
+  template <typename IdxT, typename Data>
   static IdxT get_global_replication(Data const& data)
   {
     IdxT func_min_global_replication = data.func_min_global_replication;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication{}.template get_preferred_replication<IdxT>(
+            data);
 
-    return next_pow2(std::max(preferred_replication, func_min_global_replication));
+    return next_pow2(
+        std::max(preferred_replication, func_min_global_replication));
   }
 };
 
 
-enum struct reduce_algorithm : int
-{
+enum struct reduce_algorithm : int {
   combine_last_block,
   init_device_combine_atomic_block,
   init_host_combine_atomic_block
 };
 
-enum struct block_communication_mode : int
-{
-  device_fence,
-  block_fence
-};
+enum struct block_communication_mode : int { device_fence, block_fence };
 
-template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode,
-           size_t t_replication, size_t t_atomic_stride >
-struct ReduceTuning
-{
+template <reduce_algorithm t_algorithm,
+          block_communication_mode t_comm_mode,
+          size_t t_replication,
+          size_t t_atomic_stride>
+struct ReduceTuning {
   static constexpr reduce_algorithm algorithm = t_algorithm;
   static constexpr block_communication_mode comm_mode = t_comm_mode;
   static constexpr size_t replication = t_replication;
@@ -268,27 +266,24 @@ struct ReduceTuning
 };
 
 
-enum struct multi_reduce_algorithm : int
-{
+enum struct multi_reduce_algorithm : int {
   init_host_combine_block_atomic_then_grid_atomic,
   init_host_combine_global_atomic
 };
 
-template < typename t_AtomicReplicationConcretizer,
-           typename t_ReplicationIndexer,
-           typename t_OffsetCalculator >
-struct AtomicReplicationTuning
-{
+template <typename t_AtomicReplicationConcretizer,
+          typename t_ReplicationIndexer,
+          typename t_OffsetCalculator>
+struct AtomicReplicationTuning {
   using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer;
   using ReplicationIndexer = t_ReplicationIndexer;
   using OffsetCalculator = t_OffsetCalculator;
 };
 
-template < multi_reduce_algorithm t_algorithm,
-           typename t_SharedAtomicReplicationTuning,
-           typename t_GlobalAtomicReplicationTuning >
-struct MultiReduceTuning
-{
+template <multi_reduce_algorithm t_algorithm,
+          typename t_SharedAtomicReplicationTuning,
+          typename t_GlobalAtomicReplicationTuning>
+struct MultiReduceTuning {
   static constexpr multi_reduce_algorithm algorithm = t_algorithm;
   using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning;
   using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning;
@@ -302,21 +297,24 @@ namespace policy
 namespace hip
 {
 
-struct DeviceConstants
-{
+struct DeviceConstants {
   RAJA::Index_type WARP_SIZE;
   RAJA::Index_type MAX_BLOCK_SIZE;
   RAJA::Index_type MAX_WARPS;
-  RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics
+  RAJA::Index_type
+      ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE;  // basically the cache line size of
+                                             // the cache level that handles
+                                             // atomics
 
   constexpr DeviceConstants(RAJA::Index_type warp_size,
                             RAJA::Index_type max_block_size,
                             RAJA::Index_type atomic_cache_line_bytes) noexcept
-    : WARP_SIZE(warp_size)
-    , MAX_BLOCK_SIZE(max_block_size)
-    , MAX_WARPS(max_block_size / warp_size)
-    , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
-  { }
+      : WARP_SIZE(warp_size),
+        MAX_BLOCK_SIZE(max_block_size),
+        MAX_WARPS(max_block_size / warp_size),
+        ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
+  {
+  }
 };
 
 //
@@ -324,38 +322,49 @@ struct DeviceConstants
 // values for HIP warp size and max block size.
 //
 #if defined(__HIP_PLATFORM_AMD__)
-constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE, 1024, 64); // MI300A
-// constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE, 1024, 128); // MI250X
+constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE,
+                                           1024,
+                                           64);  // MI300A
+// MI250X:
+// constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE, 1024, 128);
 
 #elif defined(__HIP_PLATFORM_NVIDIA__)
-constexpr DeviceConstants device_constants(32, 1024, 32); // V100
+constexpr DeviceConstants device_constants(32, 1024, 32);  // V100
 #endif
 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
-              "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS");
+              "RAJA Assumption Broken: device_constants.WARP_SIZE < "
+              "device_constants.MAX_WARPS");
 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
               "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
               "a multiple of device_constants.WARP_SIZE");
 
 
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
-struct hip_indexer {};
+template <typename _IterationMapping,
+          kernel_sync_requirement sync,
+          typename... _IterationGetters>
+struct hip_indexer {
+};
 
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
+template <typename _IterationMapping,
+          kernel_sync_requirement sync,
+          typename... _IterationGetters>
 struct hip_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t<
-  RAJA::Policy::hip,
-  RAJA::Pattern::region,
-  detail::get_launch<true /*async */>::value,
-  RAJA::Platform::hip> {
+                                 RAJA::Policy::hip,
+                                 RAJA::Pattern::region,
+                                 detail::get_launch<true /*async */>::value,
+                                 RAJA::Platform::hip> {
   using IterationGetter = RAJA::hip::IndexFlatten<_IterationGetters...>;
 };
 
-template <typename _IterationMapping, typename _IterationGetter, typename _LaunchConcretizer,
+template <typename _IterationMapping,
+          typename _IterationGetter,
+          typename _LaunchConcretizer,
           bool Async = false>
 struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::forall,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::hip> {
+                      RAJA::Policy::hip,
+                      RAJA::Pattern::forall,
+                      detail::get_launch<Async>::value,
+                      RAJA::Platform::hip> {
   using IterationMapping = _IterationMapping;
   using IterationGetter = _IterationGetter;
   using LaunchConcretizer = _LaunchConcretizer;
@@ -363,10 +372,10 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t<
 
 template <bool Async, int num_threads = named_usage::unspecified>
 struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::region,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::hip> {
+                          RAJA::Policy::hip,
+                          RAJA::Pattern::region,
+                          detail::get_launch<Async>::value,
+                          RAJA::Platform::hip> {
 };
 
 
@@ -379,10 +388,10 @@ struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
 ///
 template <size_t BLOCK_SIZE, bool Async = false>
 struct hip_work : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::workgroup_exec,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::hip> {
+                      RAJA::Policy::hip,
+                      RAJA::Pattern::workgroup_exec,
+                      detail::get_launch<Async>::value,
+                      RAJA::Platform::hip> {
 };
 
 /// execute the enqueued loops in an unordered fashion by mapping loops to
@@ -391,9 +400,9 @@ struct hip_work : public RAJA::make_policy_pattern_launch_platform_t<
 /// of all the loops
 struct unordered_hip_loop_y_block_iter_x_threadblock_average
     : public RAJA::make_policy_pattern_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::workgroup_order,
-                       RAJA::Platform::hip> {
+          RAJA::Policy::hip,
+          RAJA::Pattern::workgroup_order,
+          RAJA::Platform::hip> {
 };
 
 
@@ -405,36 +414,36 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average
 ///////////////////////////////////////////////////////////////////////
 ///
 
-template < typename tuning >
-struct hip_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::hip,
-                                                RAJA::Pattern::reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::hip,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
+template <typename tuning>
+struct hip_reduce_policy : public RAJA::make_policy_pattern_launch_platform_t<
+                               RAJA::Policy::hip,
+                               RAJA::Pattern::reduce,
+                               detail::get_launch<false>::value,
+                               RAJA::Platform::hip,
+                               std::conditional_t<tuning::consistent,
+                                                  reduce::ordered,
+                                                  reduce::unordered>> {
 };
 
-template < typename tuning >
+template <typename tuning>
 struct hip_multi_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::hip,
-                                                RAJA::Pattern::multi_reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::hip,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::hip,
+          RAJA::Pattern::multi_reduce,
+          detail::get_launch<false>::value,
+          RAJA::Platform::hip,
+          std::conditional_t<tuning::consistent,
+                             reduce::ordered,
+                             reduce::unordered>> {
 };
 
 /*!
  * Hip atomic policy for using hip atomics on the device and
  * the provided policy on the host
  */
-template<typename host_policy>
-struct hip_atomic_explicit{};
+template <typename host_policy>
+struct hip_atomic_explicit {
+};
 
 /*!
  * Default hip atomic policy uses hip atomics on the device and non-atomics
@@ -445,11 +454,13 @@ using hip_atomic = hip_atomic_explicit<seq_atomic>;
 
 // Policy for RAJA::statement::Reduce that reduces threads in a block
 // down to threadIdx 0
-struct hip_block_reduce{};
+struct hip_block_reduce {
+};
 
 // Policy for RAJA::statement::Reduce that reduces threads in a warp
 // down to the first lane of the warp
-struct hip_warp_reduce{};
+struct hip_warp_reduce {
+};
 
 // Policy to map work directly to threads within a warp
 // Maximum iteration count is WARP_SIZE
@@ -463,15 +474,15 @@ struct hip_warp_reduce{};
 // struct hip_warp_loop{};
 
 
-
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with hip_thread_x_*
 // Multiple warps have to be created by using hip_thread_{yz}_*
 // Since we are masking specific threads, multiple nested
 // hip_warp_masked
 // can be used to create complex thread interleaving patterns
-template<typename Mask>
-struct hip_warp_masked_direct {};
+template <typename Mask>
+struct hip_warp_masked_direct {
+};
 
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with hip_thread_x_*
@@ -479,20 +490,23 @@ struct hip_warp_masked_direct {};
 // Since we are masking specific threads, multiple nested
 // hip_warp_masked
 // can be used to create complex thread interleaving patterns
-template<typename Mask>
-struct hip_warp_masked_loop {};
+template <typename Mask>
+struct hip_warp_masked_loop {
+};
 
 
-template<typename Mask>
-struct hip_thread_masked_direct {};
+template <typename Mask>
+struct hip_thread_masked_direct {
+};
 
-template<typename Mask>
-struct hip_thread_masked_loop {};
+template <typename Mask>
+struct hip_thread_masked_loop {
+};
 
 
 struct hip_synchronize : make_policy_pattern_launch_t<Policy::hip,
-                                                       Pattern::synchronize,
-                                                       Launch::sync> {
+                                                      Pattern::synchronize,
+                                                      Launch::sync> {
 };
 
 }  // end namespace hip
@@ -505,18 +519,16 @@ namespace internal
 RAJA_INLINE
 int get_size(hip_dim_t dims)
 {
-  if(dims.x == 0 && dims.y == 0 && dims.z == 0){
+  if (dims.x == 0 && dims.y == 0 && dims.z == 0) {
     return 0;
   }
-  return (dims.x ? dims.x : 1) *
-         (dims.y ? dims.y : 1) *
-         (dims.z ? dims.z : 1);
+  return (dims.x ? dims.x : 1) * (dims.y ? dims.y : 1) * (dims.z ? dims.z : 1);
 }
 
 struct HipDims {
 
-  hip_dim_t blocks{0,0,0};
-  hip_dim_t threads{0,0,0};
+  hip_dim_t blocks{0, 0, 0};
+  hip_dim_t threads{0, 0, 0};
 
   HipDims() = default;
   HipDims(HipDims const&) = default;
@@ -524,22 +536,20 @@ struct HipDims {
 
   RAJA_INLINE
   HipDims(hip_dim_member_t default_val)
-    : blocks{default_val, default_val, default_val}
-    , threads{default_val, default_val, default_val}
-  { }
+      : blocks{default_val, default_val, default_val},
+        threads{default_val, default_val, default_val}
+  {
+  }
 
   RAJA_INLINE
-  int num_blocks() const {
-    return get_size(blocks);
-  }
+  int num_blocks() const { return get_size(blocks); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return get_size(threads);
-  }
+  int num_threads() const { return get_size(threads); }
 
   RAJA_INLINE
-  hip_dim_t get_blocks() const {
+  hip_dim_t get_blocks() const
+  {
     if (num_blocks() != 0) {
       return {(blocks.x ? blocks.x : 1),
               (blocks.y ? blocks.y : 1),
@@ -550,7 +560,8 @@ struct HipDims {
   }
 
   RAJA_INLINE
-  hip_dim_t get_threads() const {
+  hip_dim_t get_threads() const
+  {
     if (num_threads() != 0) {
       return {(threads.x ? threads.x : 1),
               (threads.y ? threads.y : 1),
@@ -561,101 +572,85 @@ struct HipDims {
   }
 };
 
-template<named_dim dim>
+template <named_dim dim>
 struct HipDimHelper;
 
-template<>
-struct HipDimHelper<named_dim::x>{
+template <>
+struct HipDimHelper<named_dim::x> {
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  hip_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
   {
     return d.x;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, hip_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
   {
     d.x = value;
   }
 };
 
-template<>
-struct HipDimHelper<named_dim::y>{
+template <>
+struct HipDimHelper<named_dim::y> {
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  hip_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
   {
     return d.y;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, hip_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
   {
     d.y = value;
   }
 };
 
-template<>
-struct HipDimHelper<named_dim::z>{
+template <>
+struct HipDimHelper<named_dim::z> {
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  hip_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
   {
     return d.z;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, hip_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
   {
     d.z = value;
   }
 };
 
-template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-constexpr
-hip_dim_member_t get_hip_dim(dim_t const &d)
+template <named_dim dim, typename dim_t>
+RAJA_HOST_DEVICE constexpr hip_dim_member_t get_hip_dim(dim_t const& d)
 {
   return HipDimHelper<dim>::get(d);
 }
 
-template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-void set_hip_dim(dim_t &d, hip_dim_member_t value)
+template <named_dim dim, typename dim_t>
+RAJA_HOST_DEVICE void set_hip_dim(dim_t& d, hip_dim_member_t value)
 {
   return HipDimHelper<dim>::set(d, value);
 }
 
-} // namespace internal
+}  // namespace internal
 
 namespace hip
 {
 
 /// specify block size and grid size for one dimension at runtime
-struct IndexSize
-{
+struct IndexSize {
   hip_dim_member_t block_size = named_usage::unspecified;
   hip_dim_member_t grid_size = named_usage::unspecified;
 
-  RAJA_HOST_DEVICE constexpr
-  IndexSize(hip_dim_member_t _block_size = named_usage::unspecified,
-            hip_dim_member_t _grid_size = named_usage::unspecified)
-    : block_size(_block_size)
-    , grid_size(_grid_size)
-  { }
+  RAJA_HOST_DEVICE constexpr IndexSize(
+      hip_dim_member_t _block_size = named_usage::unspecified,
+      hip_dim_member_t _grid_size = named_usage::unspecified)
+      : block_size(_block_size), grid_size(_grid_size)
+  {
+  }
 };
 
 /// Type representing thread indexing within a grid
@@ -663,436 +658,436 @@ struct IndexSize
 
 /// useful for global indexing
 /// with fixed block size and fixed grid size
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
-struct IndexGlobal
-{
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+struct IndexGlobal {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
   static constexpr int grid_size = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(block_size) * static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed block size of 1 and fixed grid size
-template<named_dim dim, int GRID_SIZE>
-struct IndexGlobal<dim, 1, GRID_SIZE>
-{
+template <named_dim dim, int GRID_SIZE>
+struct IndexGlobal<dim, 1, GRID_SIZE> {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = 1;
   static constexpr int grid_size = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed block size and fixed grid size of 1
-template<named_dim dim, int BLOCK_SIZE>
-struct IndexGlobal<dim, BLOCK_SIZE, 1>
-{
+template <named_dim dim, int BLOCK_SIZE>
+struct IndexGlobal<dim, BLOCK_SIZE, 1> {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
   static constexpr int grid_size = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
 /// with fixed block size and fixed grid size of 1
-template<named_dim dim>
-struct IndexGlobal<dim, 1, 1>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, 1, 1> {
   static constexpr int block_size = 1;
   static constexpr int grid_size = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
 /// with dynamic block size and fixed grid size
-template<named_dim dim, int GRID_SIZE>
-struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>
-{
+template <named_dim dim, int GRID_SIZE>
+struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE> {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::unspecified;
   static constexpr int grid_size = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(grid_size);
   }
 };
 /// with dynamic block size and fixed grid size of 1
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::unspecified, 1>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::unspecified, 1> {
   static constexpr int block_size = named_usage::unspecified;
   static constexpr int grid_size = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockDim));
   }
 };
 
 /// with fixed block size and dynamic grid size
-template<named_dim dim, int BLOCK_SIZE>
-struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>
-{
+template <named_dim dim, int BLOCK_SIZE>
+struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified> {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
   static constexpr int grid_size = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 /// with fixed block size of 1 and dynamic grid size
-template<named_dim dim>
-struct IndexGlobal<dim, 1, named_usage::unspecified>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, 1, named_usage::unspecified> {
   static constexpr int block_size = 1;
   static constexpr int grid_size = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 
 /// with dynamic block size and dynamic grid size
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified> {
   static constexpr int block_size = named_usage::unspecified;
   static constexpr int grid_size = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 
 /// useful for indexing blocks (ignores thread indices)
 /// with fixed grid size
-template<named_dim dim, int GRID_SIZE>
-struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE>
-{
+template <named_dim dim, int GRID_SIZE>
+struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE> {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::ignored;
   static constexpr int grid_size = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed grid sized of 1
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::ignored, 1>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::ignored, 1> {
   static constexpr int block_size = named_usage::ignored;
   static constexpr int grid_size = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 /// with dynamic grid size
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified> {
   static constexpr int block_size = named_usage::ignored;
   static constexpr int grid_size = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 
 /// useful for indexing threads (ignores block indices)
 /// with fixed block size
-template<named_dim dim, int BLOCK_SIZE>
-struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>
-{
+template <named_dim dim, int BLOCK_SIZE>
+struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored> {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
   static constexpr int grid_size = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
 /// with fixed block size of 1
-template<named_dim dim>
-struct IndexGlobal<dim, 1, named_usage::ignored>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, 1, named_usage::ignored> {
   static constexpr int block_size = 1;
   static constexpr int grid_size = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 /// with dynamic block size
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored> {
   static constexpr int block_size = named_usage::unspecified;
   static constexpr int grid_size = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockDim));
   }
 };
 
 /// useful for doing single threaded sequential tasks
 /// (ignores thread and block indices)
-template<named_dim dim>
-struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored>
-{
+template <named_dim dim>
+struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored> {
   static constexpr int block_size = named_usage::ignored;
   static constexpr int grid_size = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
 // useful for flatten global index (includes x)
-template<typename x_index>
-struct IndexFlatten<x_index>
-{
+template <typename x_index>
+struct IndexFlatten<x_index> {
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>();
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>();
+    return x_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y)
-template<typename x_index, typename y_index>
-struct IndexFlatten<x_index, y_index>
-{
+template <typename x_index, typename y_index>
+struct IndexFlatten<x_index, y_index> {
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>());
-
+           x_index::template size<IdxT>() * (y_index::template index<IdxT>());
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y,z)
-template<typename x_index, typename y_index, typename z_index>
-struct IndexFlatten<x_index, y_index, z_index>
-{
+template <typename x_index, typename y_index, typename z_index>
+struct IndexFlatten<x_index, y_index, z_index> {
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>() +
-                                         y_index::template size<IdxT>() * z_index::template index<IdxT>());
+           x_index::template size<IdxT>() *
+               (y_index::template index<IdxT>() +
+                y_index::template size<IdxT>() *
+                    z_index::template index<IdxT>());
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> () * z_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>() *
+           z_index::template size<IdxT>();
   }
-
 };
 
-template<size_t divisor, typename indexer>
-struct IndexDivide
-{
-  template < typename IdxT = hip_dim_member_t >
+template <size_t divisor, typename indexer>
+struct IndexDivide {
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() / static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(), static_cast<IdxT>(divisor));
+    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(),
+                                   static_cast<IdxT>(divisor));
   }
 };
 
-template<size_t divisor, typename indexer>
-struct IndexModulo
-{
-  template < typename IdxT = hip_dim_member_t >
+template <size_t divisor, typename indexer>
+struct IndexModulo {
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() % static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(divisor);
@@ -1101,125 +1096,120 @@ struct IndexModulo
 
 
 // helper to get just the thread indexing part of IndexGlobal
-template < typename index_global >
+template <typename index_global>
 struct get_index_thread;
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
-struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
-{
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>> {
   using type = IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 };
 ///
 template <typename x_index, typename y_index, typename z_index>
-struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
-{
+struct get_index_thread<IndexFlatten<x_index, y_index, z_index>> {
   using type = IndexFlatten<typename get_index_thread<x_index>::type,
                             typename get_index_thread<y_index>::type,
                             typename get_index_thread<z_index>::type>;
 };
 
 // helper to get just the block indexing part of IndexGlobal
-template < typename index_global >
+template <typename index_global>
 struct get_index_block;
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
-struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
-{
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>> {
   using type = IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 };
 ///
 template <typename x_index, typename y_index, typename z_index>
-struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
-{
+struct get_index_block<IndexFlatten<x_index, y_index, z_index>> {
   using type = IndexFlatten<typename get_index_block<x_index>::type,
                             typename get_index_block<y_index>::type,
                             typename get_index_block<z_index>::type>;
 };
 
 
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_x = IndexGlobal<named_dim::x, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_y = IndexGlobal<named_dim::y, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_z = IndexGlobal<named_dim::z, BLOCK_SIZE, named_usage::ignored>;
 
-template <size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
+template <size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified>
 using thread_xyz = IndexFlatten<thread_x<BLOCK_SIZE_X>,
                                 thread_y<BLOCK_SIZE_Y>,
                                 thread_z<BLOCK_SIZE_Z>>;
 
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_x = IndexGlobal<named_dim::x, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_y = IndexGlobal<named_dim::y, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_z = IndexGlobal<named_dim::z, named_usage::ignored, GRID_SIZE>;
 
-template <size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+template <size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
 using block_xyz = IndexFlatten<block_x<GRID_SIZE_X>,
                                block_y<GRID_SIZE_Y>,
                                block_z<GRID_SIZE_Z>>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_x = IndexGlobal<named_dim::x, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_y = IndexGlobal<named_dim::y, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_z = IndexGlobal<named_dim::z, BLOCK_SIZE, GRID_SIZE>;
 
 
 template <size_t BLOCK_SIZE_X,
           size_t BLOCK_SIZE_Y,
           size_t BLOCK_SIZE_Z,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+          size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
 using global_xyz = IndexFlatten<global_x<BLOCK_SIZE_X, GRID_SIZE_X>,
                                 global_y<BLOCK_SIZE_Y, GRID_SIZE_Y>,
                                 global_z<BLOCK_SIZE_Z, GRID_SIZE_Z>>;
 
 
-template <size_t WARP_SIZE=RAJA::policy::hip::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
-using warp_xyz = IndexDivide<WARP_SIZE,
-                             thread_xyz<BLOCK_SIZE_X,
-                                        BLOCK_SIZE_Y,
-                                        BLOCK_SIZE_Z>>;
-
-template <size_t WARP_SIZE=RAJA::policy::hip::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
-using warp_global_xyz = IndexFlatten<warp_xyz<WARP_SIZE,
-                                              BLOCK_SIZE_X,
-                                              BLOCK_SIZE_Y,
-                                              BLOCK_SIZE_Z>,
-                                     block_xyz<GRID_SIZE_X,
-                                               GRID_SIZE_Y,
-                                               GRID_SIZE_Z>>;
-
-} // namespace hip
+template <size_t WARP_SIZE = RAJA::policy::hip::device_constants.WARP_SIZE,
+          size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified>
+using warp_xyz =
+    IndexDivide<WARP_SIZE,
+                thread_xyz<BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>>;
+
+template <size_t WARP_SIZE = RAJA::policy::hip::device_constants.WARP_SIZE,
+          size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified,
+          size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
+using warp_global_xyz =
+    IndexFlatten<warp_xyz<WARP_SIZE, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>,
+                 block_xyz<GRID_SIZE_X, GRID_SIZE_Y, GRID_SIZE_Z>>;
+
+}  // namespace hip
 
 // contretizers used in forall, scan, and sort policies
 
-using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer<hip::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
+using HipAvoidDeviceMaxThreadOccupancyConcretizer =
+    hip::AvoidDeviceMaxThreadOccupancyConcretizer<
+        hip::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
 
-template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
-using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
+template <typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
+using HipFractionOffsetOccupancyConcretizer =
+    hip::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
 
 using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer;
 
-using HipReduceDefaultConcretizer = HipFractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 2>, 0>;
+using HipReduceDefaultConcretizer =
+    HipFractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 2>, 0>;
 
 using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer;
 
@@ -1227,83 +1217,111 @@ using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
 using hip_exec_grid = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE, GRID_SIZE>,
-    HipDefaultConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE, GRID_SIZE>,
+    HipDefaultConcretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE>
 using hip_exec_grid_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE, GRID_SIZE>,
-    HipDefaultConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE, GRID_SIZE>,
+    HipDefaultConcretizer,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
-using hip_exec = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, Async>;
+using hip_exec = policy::hip::hip_exec<iteration_mapping::Direct,
+                                       hip::global_x<BLOCK_SIZE>,
+                                       HipDefaultConcretizer,
+                                       Async>;
 
 template <size_t BLOCK_SIZE>
-using hip_exec_async = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, true>;
+using hip_exec_async = policy::hip::hip_exec<iteration_mapping::Direct,
+                                             hip::global_x<BLOCK_SIZE>,
+                                             HipDefaultConcretizer,
+                                             true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using hip_exec_occ_calc = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipDefaultConcretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using hip_exec_occ_calc_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipDefaultConcretizer,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using hip_exec_occ_max = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipMaxOccupancyConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipMaxOccupancyConcretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using hip_exec_occ_max_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipMaxOccupancyConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipMaxOccupancyConcretizer,
+    true>;
 
 template <size_t BLOCK_SIZE, typename Fraction, bool Async = false>
 using hip_exec_occ_fraction = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipFractionOffsetOccupancyConcretizer<Fraction, 0>, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    Async>;
 
 template <size_t BLOCK_SIZE, typename Fraction>
 using hip_exec_occ_fraction_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipFractionOffsetOccupancyConcretizer<Fraction, 0>, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    true>;
 
 template <size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
 using hip_exec_occ_custom = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    Concretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    Concretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE, typename Concretizer>
 using hip_exec_occ_custom_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    Concretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    Concretizer,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using hip_exec_with_reduce = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipReduceDefaultConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipReduceDefaultConcretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using hip_exec_with_reduce_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipReduceDefaultConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipReduceDefaultConcretizer,
+    true>;
 
 template <bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
-using hip_exec_base = std::conditional_t<with_reduce,
-    hip_exec_with_reduce<BLOCK_SIZE, Async>,
-    hip_exec<BLOCK_SIZE, Async>>;
+using hip_exec_base =
+    std::conditional_t<with_reduce,
+                       hip_exec_with_reduce<BLOCK_SIZE, Async>,
+                       hip_exec<BLOCK_SIZE, Async>>;
 
 template <bool with_reduce, size_t BLOCK_SIZE>
-using hip_exec_base_async = std::conditional_t<with_reduce,
-    hip_exec_with_reduce_async<BLOCK_SIZE>,
-    hip_exec_async<BLOCK_SIZE>>;
+using hip_exec_base_async =
+    std::conditional_t<with_reduce,
+                       hip_exec_with_reduce_async<BLOCK_SIZE>,
+                       hip_exec_async<BLOCK_SIZE>>;
 
 // policies usable with WorkGroup
 using policy::hip::hip_work;
@@ -1319,10 +1337,10 @@ using policy::hip::hip_atomic_explicit;
 
 
 // policies usable with reducers
-template < hip::reduce_algorithm algorithm,
-           hip::block_communication_mode comm_mode,
-           size_t replication = named_usage::unspecified,
-           size_t atomic_stride = named_usage::unspecified >
+template <hip::reduce_algorithm algorithm,
+          hip::block_communication_mode comm_mode,
+          size_t replication = named_usage::unspecified,
+          size_t atomic_stride = named_usage::unspecified>
 using hip_reduce_tuning = policy::hip::hip_reduce_policy<
     hip::ReduceTuning<algorithm, comm_mode, replication, atomic_stride>>;
 
@@ -1345,35 +1363,41 @@ using hip_reduce_tuning = policy::hip::hip_reduce_policy<
 //                 a cache shared by the whole device to avoid having to use
 //                 device scope fences. This improves performance on some HW but
 //                 is more difficult to code correctly.
-using hip_reduce_device_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::combine_last_block,
-    hip::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_device_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::combine_last_block,
+                      hip::block_communication_mode::device_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_block_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::combine_last_block,
-    hip::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_block_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::combine_last_block,
+                      hip::block_communication_mode::block_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_device_combine_atomic_block,
-    hip::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_device_init_device_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_device_combine_atomic_block,
+                      hip::block_communication_mode::device_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_device_combine_atomic_block,
-    hip::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_device_init_block_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_device_combine_atomic_block,
+                      hip::block_communication_mode::block_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_host_combine_atomic_block,
-    hip::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_host_init_device_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_host_combine_atomic_block,
+                      hip::block_communication_mode::device_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_host_combine_atomic_block,
-    hip::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_host_init_block_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_host_combine_atomic_block,
+                      hip::block_communication_mode::block_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 
 // Policy for RAJA::Reduce* objects that gives the same answer every time when
 // used in the same way
@@ -1385,25 +1409,26 @@ using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence;
 
 // Policy for RAJA::Reduce* objects that lets you select the default atomic or
 // non-atomic policy with a bool
-template < bool with_atomic >
-using hip_reduce_base = std::conditional_t<with_atomic, hip_reduce_atomic, hip_reduce>;
+template <bool with_atomic>
+using hip_reduce_base =
+    std::conditional_t<with_atomic, hip_reduce_atomic, hip_reduce>;
 
 
 // policies usable with multi_reducers
-template < hip::multi_reduce_algorithm algorithm,
-           typename SharedAtomicReplicationConcretizer,
-           typename SharedAtomicReplicationIndexer,
-           typename GlobalAtomicReplicationConcretizer,
-           typename GlobalAtomicReplicationIndexer >
-using hip_multi_reduce_tuning = policy::hip::hip_multi_reduce_policy<
-    hip::MultiReduceTuning<
-      algorithm,
-      hip::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
-                                    SharedAtomicReplicationIndexer,
-                                    GetOffsetRight<int>>,
-      hip::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
-                                    GlobalAtomicReplicationIndexer,
-                                    GetOffsetLeft<int>>>>;
+template <hip::multi_reduce_algorithm algorithm,
+          typename SharedAtomicReplicationConcretizer,
+          typename SharedAtomicReplicationIndexer,
+          typename GlobalAtomicReplicationConcretizer,
+          typename GlobalAtomicReplicationIndexer>
+using hip_multi_reduce_tuning =
+    policy::hip::hip_multi_reduce_policy<hip::MultiReduceTuning<
+        algorithm,
+        hip::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
+                                     SharedAtomicReplicationIndexer,
+                                     GetOffsetRight<int>>,
+        hip::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
+                                     GlobalAtomicReplicationIndexer,
+                                     GetOffsetLeft<int>>>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - *atomic* policies may use atomics to combine partial results. The
@@ -1416,44 +1441,51 @@ using hip_multi_reduce_tuning = policy::hip::hip_multi_reduce_policy<
 // - *host_init* policies initialize memory used with atomics on the host.
 //   This is faster overall than other policies on HW with direct host access
 //   to device memory such as the AMD MI300A El Capitan/Tuolumne systems.
-using hip_multi_reduce_atomic_block_then_atomic_grid_host_init = hip_multi_reduce_tuning<
-    hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    hip::SharedAtomicReplicationMaxPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<4>>,
-    hip::thread_xyz<>,
-    hip::GlobalAtomicReplicationMinPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<32>>,
-    hip::warp_global_xyz<>>;
+using hip_multi_reduce_atomic_block_then_atomic_grid_host_init =
+    hip_multi_reduce_tuning<
+        hip::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        hip::SharedAtomicReplicationMaxPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<4>>,
+        hip::thread_xyz<>,
+        hip::GlobalAtomicReplicationMinPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<32>>,
+        hip::warp_global_xyz<>>;
 // special policy to test that multi-reducers work if there is not enough shmem
-using hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = hip_multi_reduce_tuning<
-    hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    hip::SharedAtomicReplicationMaxPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<0>>,
-    hip::thread_xyz<>,
-    hip::GlobalAtomicReplicationMinPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<32>>,
-    hip::warp_global_xyz<>>;
+using hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing =
+    hip_multi_reduce_tuning<
+        hip::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        hip::SharedAtomicReplicationMaxPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<0>>,
+        hip::thread_xyz<>,
+        hip::GlobalAtomicReplicationMinPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<32>>,
+        hip::warp_global_xyz<>>;
 //
 using hip_multi_reduce_atomic_global_host_init = hip_multi_reduce_tuning<
     hip::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
+    void,  // unused with this algorithm
+    void,  // unused with this algorithm
     hip::GlobalAtomicReplicationMinPow2Concretizer<
         hip::ConstantPreferredReplicationConcretizer<32>>,
     hip::warp_global_xyz<>>;
 //
-using hip_multi_reduce_atomic_global_no_replication_host_init = hip_multi_reduce_tuning<
-    hip::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
-    hip::GlobalAtomicReplicationMinPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<1>>,
-    hip::block_xyz<>>;
-
-// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the
-// same answer every time when used in the same way
-using hip_multi_reduce_atomic = hip_multi_reduce_atomic_block_then_atomic_grid_host_init;
-// Similar to above but optimized for low overhead in cases where it is rarely used
+using hip_multi_reduce_atomic_global_no_replication_host_init =
+    hip_multi_reduce_tuning<
+        hip::multi_reduce_algorithm::init_host_combine_global_atomic,
+        void,  // unused with this algorithm
+        void,  // unused with this algorithm
+        hip::GlobalAtomicReplicationMinPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<1>>,
+        hip::block_xyz<>>;
+
+// Policy for RAJA::MultiReduce* objects that may use atomics and may not give
+// the same answer every time when used in the same way
+using hip_multi_reduce_atomic =
+    hip_multi_reduce_atomic_block_then_atomic_grid_host_init;
+// Similar to above but optimized for low overhead in cases where it is rarely
+// used
 using hip_multi_reduce_atomic_low_performance_low_overhead =
     hip_multi_reduce_atomic_global_no_replication_host_init;
 
@@ -1489,43 +1521,43 @@ using policy::hip::hip_launch_t;
 
 
 // policies usable with kernel and launch
-template < typename ... indexers >
-using hip_indexer_direct_unchecked = policy::hip::hip_indexer<
-    iteration_mapping::DirectUnchecked,
-    kernel_sync_requirement::none,
-    indexers...>;
-
-template < typename ... indexers >
-using hip_indexer_direct = policy::hip::hip_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
-
-template < typename ... indexers >
+template <typename... indexers>
+using hip_indexer_direct_unchecked =
+    policy::hip::hip_indexer<iteration_mapping::DirectUnchecked,
+                             kernel_sync_requirement::none,
+                             indexers...>;
+
+template <typename... indexers>
+using hip_indexer_direct =
+    policy::hip::hip_indexer<iteration_mapping::Direct,
+                             kernel_sync_requirement::none,
+                             indexers...>;
+
+template <typename... indexers>
 using hip_indexer_loop = policy::hip::hip_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
     indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using hip_indexer_syncable_loop = policy::hip::hip_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::sync,
     indexers...>;
 
-template < typename ... indexers >
-using hip_flatten_indexer_direct_unchecked = policy::hip::hip_flatten_indexer<
-    iteration_mapping::DirectUnchecked,
-    kernel_sync_requirement::none,
-    indexers...>;
+template <typename... indexers>
+using hip_flatten_indexer_direct_unchecked =
+    policy::hip::hip_flatten_indexer<iteration_mapping::DirectUnchecked,
+                                     kernel_sync_requirement::none,
+                                     indexers...>;
 
-template < typename ... indexers >
-using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template <typename... indexers>
+using hip_flatten_indexer_direct =
+    policy::hip::hip_flatten_indexer<iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
@@ -1533,48 +1565,83 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer<
 
 
 // helper to generate the many policy aliases
-#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, scope, mapping) \
-  \
-  using hip_##flatten##scope##_x_##mapping = hip_##flatten##scope##_##mapping<named_dim::x>; \
-  using hip_##flatten##scope##_y_##mapping = hip_##flatten##scope##_##mapping<named_dim::y>; \
-  using hip_##flatten##scope##_z_##mapping = hip_##flatten##scope##_##mapping<named_dim::z>; \
-  \
-  using hip_##flatten##scope##_xy_##mapping = hip_##flatten##scope##_##mapping<named_dim::x, named_dim::y>; \
-  using hip_##flatten##scope##_xz_##mapping = hip_##flatten##scope##_##mapping<named_dim::x, named_dim::z>; \
-  using hip_##flatten##scope##_yx_##mapping = hip_##flatten##scope##_##mapping<named_dim::y, named_dim::x>; \
-  using hip_##flatten##scope##_yz_##mapping = hip_##flatten##scope##_##mapping<named_dim::y, named_dim::z>; \
-  using hip_##flatten##scope##_zx_##mapping = hip_##flatten##scope##_##mapping<named_dim::z, named_dim::x>; \
-  using hip_##flatten##scope##_zy_##mapping = hip_##flatten##scope##_##mapping<named_dim::z, named_dim::y>; \
-  \
-  using hip_##flatten##scope##_xyz_##mapping = hip_##flatten##scope##_##mapping<named_dim::x, named_dim::y, named_dim::z>; \
-  using hip_##flatten##scope##_xzy_##mapping = hip_##flatten##scope##_##mapping<named_dim::x, named_dim::z, named_dim::y>; \
-  using hip_##flatten##scope##_yxz_##mapping = hip_##flatten##scope##_##mapping<named_dim::y, named_dim::x, named_dim::z>; \
-  using hip_##flatten##scope##_yzx_##mapping = hip_##flatten##scope##_##mapping<named_dim::y, named_dim::z, named_dim::x>; \
-  using hip_##flatten##scope##_zxy_##mapping = hip_##flatten##scope##_##mapping<named_dim::z, named_dim::x, named_dim::y>; \
-  using hip_##flatten##scope##_zyx_##mapping = hip_##flatten##scope##_##mapping<named_dim::z, named_dim::y, named_dim::x>;
+#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten,    \
+                                                        scope,      \
+                                                        mapping)    \
+                                                                    \
+  using hip_##flatten##scope##_x_##mapping =                        \
+      hip_##flatten##scope##_##mapping<named_dim::x>;               \
+  using hip_##flatten##scope##_y_##mapping =                        \
+      hip_##flatten##scope##_##mapping<named_dim::y>;               \
+  using hip_##flatten##scope##_z_##mapping =                        \
+      hip_##flatten##scope##_##mapping<named_dim::z>;               \
+                                                                    \
+  using hip_##flatten##scope##_xy_##mapping =                       \
+      hip_##flatten##scope##_##mapping<named_dim::x, named_dim::y>; \
+  using hip_##flatten##scope##_xz_##mapping =                       \
+      hip_##flatten##scope##_##mapping<named_dim::x, named_dim::z>; \
+  using hip_##flatten##scope##_yx_##mapping =                       \
+      hip_##flatten##scope##_##mapping<named_dim::y, named_dim::x>; \
+  using hip_##flatten##scope##_yz_##mapping =                       \
+      hip_##flatten##scope##_##mapping<named_dim::y, named_dim::z>; \
+  using hip_##flatten##scope##_zx_##mapping =                       \
+      hip_##flatten##scope##_##mapping<named_dim::z, named_dim::x>; \
+  using hip_##flatten##scope##_zy_##mapping =                       \
+      hip_##flatten##scope##_##mapping<named_dim::z, named_dim::y>; \
+                                                                    \
+  using hip_##flatten##scope##_xyz_##mapping =                      \
+      hip_##flatten##scope##_##mapping<named_dim::x,                \
+                                       named_dim::y,                \
+                                       named_dim::z>;               \
+  using hip_##flatten##scope##_xzy_##mapping =                      \
+      hip_##flatten##scope##_##mapping<named_dim::x,                \
+                                       named_dim::z,                \
+                                       named_dim::y>;               \
+  using hip_##flatten##scope##_yxz_##mapping =                      \
+      hip_##flatten##scope##_##mapping<named_dim::y,                \
+                                       named_dim::x,                \
+                                       named_dim::z>;               \
+  using hip_##flatten##scope##_yzx_##mapping =                      \
+      hip_##flatten##scope##_##mapping<named_dim::y,                \
+                                       named_dim::z,                \
+                                       named_dim::x>;               \
+  using hip_##flatten##scope##_zxy_##mapping =                      \
+      hip_##flatten##scope##_##mapping<named_dim::z,                \
+                                       named_dim::x,                \
+                                       named_dim::y>;               \
+  using hip_##flatten##scope##_zyx_##mapping =                      \
+      hip_##flatten##scope##_##mapping<named_dim::z,                \
+                                       named_dim::y,                \
+                                       named_dim::x>;
 
 // helper to generate the many thread policy aliases
-#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(flatten, mapping) \
-  template < named_dim ... dims > \
+#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(flatten, mapping)   \
+  template <named_dim... dims>                                              \
   using hip_##flatten##thread_##mapping = hip_##flatten##indexer_##mapping< \
-      hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>; \
-  \
+      hip::IndexGlobal<dims,                                                \
+                       named_usage::unspecified,                            \
+                       named_usage::ignored>...>;                           \
+                                                                            \
   RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, thread, mapping)
 
 // helper to generate the many block policy aliases
-#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(flatten, mapping) \
-  template < named_dim ... dims > \
+#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_POLICIES(flatten, mapping)   \
+  template <named_dim... dims>                                             \
   using hip_##flatten##block_##mapping = hip_##flatten##indexer_##mapping< \
-      hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>; \
-  \
+      hip::IndexGlobal<dims,                                               \
+                       named_usage::ignored,                               \
+                       named_usage::unspecified>...>;                      \
+                                                                           \
   RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, block, mapping)
 
 // helper to generate the many global policy aliases
-#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten, mapping) \
-  template < named_dim ... dims > \
+#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten, mapping)   \
+  template <named_dim... dims>                                              \
   using hip_##flatten##global_##mapping = hip_##flatten##indexer_##mapping< \
-      hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>; \
-  \
+      hip::IndexGlobal<dims,                                                \
+                       named_usage::unspecified,                            \
+                       named_usage::unspecified>...>;                       \
+                                                                            \
   RAJA_INTERNAL_HIP_ALIAS_INDEXER_POLICIES_HELPER(flatten, global, mapping)
 
 
@@ -1582,8 +1649,8 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer<
  * Maps segment indices to HIP threads, blocks, or global threads.
  * This is the lowest overhead mapping, but requires that there are the same
  * number of physical threads, blocks, or global threads as map requests.
- * For example, a segment of size 1000 will only fit into 1000 threads, blocks, or global threads, and
- * triggers a runtime error in some cases.
+ * For example, a segment of size 1000 will only fit into 1000 threads, blocks,
+ * or global threads, and triggers a runtime error in some cases.
  */
 RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(, direct_unchecked)
 
@@ -1594,9 +1661,9 @@ RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(, direct_unchecked)
 /*!
  * Maps segment indices to HIP threads, blocks, or global threads.
  * This is a low overhead mapping, but requires that there are enough
- * physical threads, blocks, or global threads to fit all of the direct map requests.
- * For example, a segment of size 2000 will not fit into 1024 threads, blocks,
- * or global threads, and triggers a runtime error in some cases.
+ * physical threads, blocks, or global threads to fit all of the direct map
+ * requests. For example, a segment of size 2000 will not fit into 1024 threads,
+ * blocks, or global threads, and triggers a runtime error in some cases.
  */
 RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_POLICIES(, direct)
 
@@ -1671,116 +1738,225 @@ RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_POLICIES(flatten_, loop)
 
 
 // helper to generate the many one size policy aliases
-#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, scope, mapping) \
-  template < int X_SIZE > \
-  using hip_##flatten##scope##_size_x_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>>; \
-  template < int Y_SIZE > \
-  using hip_##flatten##scope##_size_y_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>>; \
-  template < int Z_SIZE > \
-  using hip_##flatten##scope##_size_z_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>>; \
-  \
-  template < int X_SIZE, int Y_SIZE > \
-  using hip_##flatten##scope##_size_xy_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>, hip::scope##_y<Y_SIZE>>; \
-  template < int X_SIZE, int Z_SIZE > \
-  using hip_##flatten##scope##_size_xz_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>, hip::scope##_z<Z_SIZE>>; \
-  template < int Y_SIZE, int X_SIZE > \
-  using hip_##flatten##scope##_size_yx_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>, hip::scope##_x<X_SIZE>>; \
-  template < int Y_SIZE, int Z_SIZE > \
-  using hip_##flatten##scope##_size_yz_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>, hip::scope##_z<Z_SIZE>>; \
-  template < int Z_SIZE, int X_SIZE > \
-  using hip_##flatten##scope##_size_zx_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>, hip::scope##_x<X_SIZE>>; \
-  template < int Z_SIZE, int Y_SIZE > \
-  using hip_##flatten##scope##_size_zy_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>, hip::scope##_y<Y_SIZE>>; \
-  \
-  template < int X_SIZE, int Y_SIZE, int Z_SIZE > \
-  using hip_##flatten##scope##_size_xyz_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>, hip::scope##_y<Y_SIZE>, hip::scope##_z<Z_SIZE>>; \
-  template < int X_SIZE, int Z_SIZE, int Y_SIZE > \
-  using hip_##flatten##scope##_size_xzy_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>, hip::scope##_z<Z_SIZE>, hip::scope##_y<Y_SIZE>>; \
-  template < int Y_SIZE, int X_SIZE, int Z_SIZE > \
-  using hip_##flatten##scope##_size_yxz_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>, hip::scope##_x<X_SIZE>, hip::scope##_z<Z_SIZE>>; \
-  template < int Y_SIZE, int Z_SIZE, int X_SIZE > \
-  using hip_##flatten##scope##_size_yzx_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>, hip::scope##_z<Z_SIZE>, hip::scope##_x<X_SIZE>>; \
-  template < int Z_SIZE, int X_SIZE, int Y_SIZE > \
-  using hip_##flatten##scope##_size_zxy_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>, hip::scope##_x<X_SIZE>, hip::scope##_y<Y_SIZE>>; \
-  template < int Z_SIZE, int Y_SIZE, int X_SIZE > \
-  using hip_##flatten##scope##_size_zyx_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>, hip::scope##_y<Y_SIZE>, hip::scope##_x<X_SIZE>>;
+#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, \
+                                                                 scope,   \
+                                                                 mapping) \
+  template <int X_SIZE>                                                   \
+  using hip_##flatten##scope##_size_x_##mapping =                         \
+      hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>>;           \
+  template <int Y_SIZE>                                                   \
+  using hip_##flatten##scope##_size_y_##mapping =                         \
+      hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>>;           \
+  template <int Z_SIZE>                                                   \
+  using hip_##flatten##scope##_size_z_##mapping =                         \
+      hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>>;           \
+                                                                          \
+  template <int X_SIZE, int Y_SIZE>                                       \
+  using hip_##flatten##scope##_size_xy_##mapping =                        \
+      hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>,            \
+                                       hip::scope##_y<Y_SIZE>>;           \
+  template <int X_SIZE, int Z_SIZE>                                       \
+  using hip_##flatten##scope##_size_xz_##mapping =                        \
+      hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>,            \
+                                       hip::scope##_z<Z_SIZE>>;           \
+  template <int Y_SIZE, int X_SIZE>                                       \
+  using hip_##flatten##scope##_size_yx_##mapping =                        \
+      hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>,            \
+                                       hip::scope##_x<X_SIZE>>;           \
+  template <int Y_SIZE, int Z_SIZE>                                       \
+  using hip_##flatten##scope##_size_yz_##mapping =                        \
+      hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>,            \
+                                       hip::scope##_z<Z_SIZE>>;           \
+  template <int Z_SIZE, int X_SIZE>                                       \
+  using hip_##flatten##scope##_size_zx_##mapping =                        \
+      hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>,            \
+                                       hip::scope##_x<X_SIZE>>;           \
+  template <int Z_SIZE, int Y_SIZE>                                       \
+  using hip_##flatten##scope##_size_zy_##mapping =                        \
+      hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>,            \
+                                       hip::scope##_y<Y_SIZE>>;           \
+                                                                          \
+  template <int X_SIZE, int Y_SIZE, int Z_SIZE>                           \
+  using hip_##flatten##scope##_size_xyz_##mapping =                       \
+      hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>,            \
+                                       hip::scope##_y<Y_SIZE>,            \
+                                       hip::scope##_z<Z_SIZE>>;           \
+  template <int X_SIZE, int Z_SIZE, int Y_SIZE>                           \
+  using hip_##flatten##scope##_size_xzy_##mapping =                       \
+      hip_##flatten##indexer_##mapping<hip::scope##_x<X_SIZE>,            \
+                                       hip::scope##_z<Z_SIZE>,            \
+                                       hip::scope##_y<Y_SIZE>>;           \
+  template <int Y_SIZE, int X_SIZE, int Z_SIZE>                           \
+  using hip_##flatten##scope##_size_yxz_##mapping =                       \
+      hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>,            \
+                                       hip::scope##_x<X_SIZE>,            \
+                                       hip::scope##_z<Z_SIZE>>;           \
+  template <int Y_SIZE, int Z_SIZE, int X_SIZE>                           \
+  using hip_##flatten##scope##_size_yzx_##mapping =                       \
+      hip_##flatten##indexer_##mapping<hip::scope##_y<Y_SIZE>,            \
+                                       hip::scope##_z<Z_SIZE>,            \
+                                       hip::scope##_x<X_SIZE>>;           \
+  template <int Z_SIZE, int X_SIZE, int Y_SIZE>                           \
+  using hip_##flatten##scope##_size_zxy_##mapping =                       \
+      hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>,            \
+                                       hip::scope##_x<X_SIZE>,            \
+                                       hip::scope##_y<Y_SIZE>>;           \
+  template <int Z_SIZE, int Y_SIZE, int X_SIZE>                           \
+  using hip_##flatten##scope##_size_zyx_##mapping =                       \
+      hip_##flatten##indexer_##mapping<hip::scope##_z<Z_SIZE>,            \
+                                       hip::scope##_y<Y_SIZE>,            \
+                                       hip::scope##_x<X_SIZE>>;
 
 // helper to generate the many two size policy aliases
-#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, scope, mapping) \
-  template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_x_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_y_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_z_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  \
-  template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, \
-             int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_xy_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                       hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, \
-             int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_xz_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                       hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, \
-             int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_yx_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                       hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, \
-             int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_yz_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                       hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, \
-             int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_zx_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                       hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, \
-             int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_zy_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                       hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  \
-  template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, \
-             int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_xyz_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                        hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                        hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, \
-             int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_xzy_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                        hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                        hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE, \
-             int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_yxz_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                        hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                        hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>; \
-  template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE, \
-             int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_yzx_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                        hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                        hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE, \
-             int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_zxy_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                        hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>, \
-                                                        hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>; \
-  template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE, \
-             int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified > \
-  using hip_##flatten##scope##_size_zyx_##mapping = hip_##flatten##indexer_##mapping<hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>, \
-                                                        hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>, \
-                                                        hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>; \
+#define RAJA_INTERNAL_HIP_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, \
+                                                                 scope,   \
+                                                                 mapping) \
+  template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified> \
+  using hip_##flatten##scope##_size_x_##mapping =                         \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified> \
+  using hip_##flatten##scope##_size_y_##mapping =                         \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified> \
+  using hip_##flatten##scope##_size_z_##mapping =                         \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+                                                                          \
+  template <int X_BLOCK_SIZE,                                             \
+            int Y_BLOCK_SIZE,                                             \
+            int X_GRID_SIZE = named_usage::unspecified,                   \
+            int Y_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_xy_##mapping =                        \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+  template <int X_BLOCK_SIZE,                                             \
+            int Z_BLOCK_SIZE,                                             \
+            int X_GRID_SIZE = named_usage::unspecified,                   \
+            int Z_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_xz_##mapping =                        \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE,                                             \
+            int X_BLOCK_SIZE,                                             \
+            int Y_GRID_SIZE = named_usage::unspecified,                   \
+            int X_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_yx_##mapping =                        \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE,                                             \
+            int Z_BLOCK_SIZE,                                             \
+            int Y_GRID_SIZE = named_usage::unspecified,                   \
+            int Z_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_yz_##mapping =                        \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE,                                             \
+            int X_BLOCK_SIZE,                                             \
+            int Z_GRID_SIZE = named_usage::unspecified,                   \
+            int X_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_zx_##mapping =                        \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE,                                             \
+            int Y_BLOCK_SIZE,                                             \
+            int Z_GRID_SIZE = named_usage::unspecified,                   \
+            int Y_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_zy_##mapping =                        \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+                                                                          \
+  template <int X_BLOCK_SIZE,                                             \
+            int Y_BLOCK_SIZE,                                             \
+            int Z_BLOCK_SIZE,                                             \
+            int X_GRID_SIZE = named_usage::unspecified,                   \
+            int Y_GRID_SIZE = named_usage::unspecified,                   \
+            int Z_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_xyz_##mapping =                       \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+  template <int X_BLOCK_SIZE,                                             \
+            int Z_BLOCK_SIZE,                                             \
+            int Y_BLOCK_SIZE,                                             \
+            int X_GRID_SIZE = named_usage::unspecified,                   \
+            int Z_GRID_SIZE = named_usage::unspecified,                   \
+            int Y_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_xzy_##mapping =                       \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE,                                             \
+            int X_BLOCK_SIZE,                                             \
+            int Z_BLOCK_SIZE,                                             \
+            int Y_GRID_SIZE = named_usage::unspecified,                   \
+            int X_GRID_SIZE = named_usage::unspecified,                   \
+            int Z_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_yxz_##mapping =                       \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;                     \
+  template <int Y_BLOCK_SIZE,                                             \
+            int Z_BLOCK_SIZE,                                             \
+            int X_BLOCK_SIZE,                                             \
+            int Y_GRID_SIZE = named_usage::unspecified,                   \
+            int Z_GRID_SIZE = named_usage::unspecified,                   \
+            int X_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_yzx_##mapping =                       \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE,                                             \
+            int X_BLOCK_SIZE,                                             \
+            int Y_BLOCK_SIZE,                                             \
+            int Z_GRID_SIZE = named_usage::unspecified,                   \
+            int X_GRID_SIZE = named_usage::unspecified,                   \
+            int Y_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_zxy_##mapping =                       \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>,                      \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;                     \
+  template <int Z_BLOCK_SIZE,                                             \
+            int Y_BLOCK_SIZE,                                             \
+            int X_BLOCK_SIZE,                                             \
+            int Z_GRID_SIZE = named_usage::unspecified,                   \
+            int Y_GRID_SIZE = named_usage::unspecified,                   \
+            int X_GRID_SIZE = named_usage::unspecified>                   \
+  using hip_##flatten##scope##_size_zyx_##mapping =                       \
+      hip_##flatten##indexer_##mapping<                                   \
+          hip::scope##_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,                      \
+          hip::scope##_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,                      \
+          hip::scope##_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 // helper to generate the many thread size policy aliases
 #define RAJA_INTERNAL_HIP_ALIAS_INDEXER_THREAD_SIZE_POLICIES(flatten, mapping) \
-    RAJA_INTERNAL_HIP_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, thread, mapping)
+  RAJA_INTERNAL_HIP_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten,            \
+                                                           thread,             \
+                                                           mapping)
 
 // helper to generate the many block size policy aliases
 #define RAJA_INTERNAL_HIP_ALIAS_INDEXER_BLOCK_SIZE_POLICIES(flatten, mapping) \
-    RAJA_INTERNAL_HIP_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten, block, mapping)
+  RAJA_INTERNAL_HIP_ALIAS_INDEXER_ONE_SIZE_POLICIES_HELPER(flatten,           \
+                                                           block,             \
+                                                           mapping)
 
 // helper to generate the many global size policy aliases
 #define RAJA_INTERNAL_HIP_ALIAS_INDEXER_GLOBAL_SIZE_POLICIES(flatten, mapping) \
-    RAJA_INTERNAL_HIP_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten, global, mapping)
+  RAJA_INTERNAL_HIP_ALIAS_INDEXER_TWO_SIZE_POLICIES_HELPER(flatten,            \
+                                                           global,             \
+                                                           mapping)
 
 
 /*!
diff --git a/include/RAJA/policy/hip/raja_hiperrchk.hpp b/include/RAJA/policy/hip/raja_hiperrchk.hpp
index 5e3a02fb2c..a26beb5b96 100644
--- a/include/RAJA/policy/hip/raja_hiperrchk.hpp
+++ b/include/RAJA/policy/hip/raja_hiperrchk.hpp
@@ -24,11 +24,11 @@
 
 #if defined(RAJA_ENABLE_HIP)
 
+#include <hip/hip_runtime.h>
+
 #include <iostream>
 #include <string>
 
-#include <hip/hip_runtime.h>
-
 #include "RAJA/util/macros.hpp"
 
 namespace RAJA
@@ -43,14 +43,14 @@ namespace RAJA
 ///////////////////////////////////////////////////////////////////////
 ///
 #define hipErrchk(ans)                            \
-  {                                                \
+  {                                               \
     ::RAJA::hipAssert((ans), __FILE__, __LINE__); \
   }
 
 inline void hipAssert(hipError_t code,
-                       const char *file,
-                       int line,
-                       bool abort = true)
+                      const char *file,
+                      int line,
+                      bool abort = true)
 {
   if (code != hipSuccess) {
     if (abort) {
@@ -63,8 +63,8 @@ inline void hipAssert(hipError_t code,
       msg += std::to_string(line);
       throw std::runtime_error(msg);
     } else {
-      fprintf(stderr, "HIPassert: %s %s %d\n",
-              hipGetErrorString(code), file, line);
+      fprintf(
+          stderr, "HIPassert: %s %s %d\n", hipGetErrorString(code), file, line);
     }
   }
 }
diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp
index c81adf8e24..d0842fca9a 100644
--- a/include/RAJA/policy/hip/reduce.hpp
+++ b/include/RAJA/policy/hip/reduce.hpp
@@ -25,26 +25,24 @@
 
 #if defined(RAJA_ENABLE_HIP)
 
-#include <type_traits>
-
 #include <hip/hip_runtime.h>
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/SoAArray.hpp"
-#include "RAJA/util/SoAPtr.hpp"
-#include "RAJA/util/basic_mempool.hpp"
-#include "RAJA/util/mutex.hpp"
-#include "RAJA/util/types.hpp"
-#include "RAJA/util/reduce.hpp"
+#include <type_traits>
 
 #include "RAJA/pattern/detail/reduce.hpp"
 #include "RAJA/pattern/reduce.hpp"
-
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
-#include "RAJA/policy/hip/intrinsics.hpp"
 #include "RAJA/policy/hip/atomic.hpp"
+#include "RAJA/policy/hip/intrinsics.hpp"
 #include "RAJA/policy/hip/policy.hpp"
 #include "RAJA/policy/hip/raja_hiperrchk.hpp"
+#include "RAJA/util/SoAArray.hpp"
+#include "RAJA/util/SoAPtr.hpp"
+#include "RAJA/util/basic_mempool.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/mutex.hpp"
+#include "RAJA/util/reduce.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -118,15 +116,19 @@ namespace impl
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride,
-          typename T, typename TempIterator>
+template <typename Combiner,
+          typename Accessor,
+          int replication,
+          int atomic_stride,
+          typename T,
+          typename TempIterator>
 RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
-                                        T identity,
-                                        TempIterator in_device_mem,
-                                        unsigned int* device_count)
+                                                   T identity,
+                                                   TempIterator in_device_mem,
+                                                   unsigned int* device_count)
 {
-  typename TempIterator::template rebind_accessor<Accessor> device_mem(in_device_mem);
+  typename TempIterator::template rebind_accessor<Accessor> device_mem(
+      in_device_mem);
 
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -141,7 +143,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
 
   int maxNumSlots = (numBlocks + replication - 1) / replication;
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
   int atomicOffset = replicationId * atomic_stride;
   int beginSlots = replicationId * maxNumSlots;
@@ -163,8 +165,9 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
     // ensure write visible to all threadblocks
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots-1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1));
-    isLastBlock = (old_count == (numSlots-1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
+    isLastBlock = (old_count == (numSlots - 1));
   }
 
   // returns non-zero value if any thread passes in a non-zero value
@@ -175,9 +178,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
     temp = identity;
     Accessor::fence_acquire();
 
-    for (unsigned int i = threadId;
-                      i < numSlots;
-                      i += numThreads) {
+    for (unsigned int i = threadId; i < numSlots; i += numThreads) {
       Combiner{}(temp, device_mem.get(beginSlots + i));
     }
 
@@ -192,7 +193,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   return (isLastBlock && threadId == 0) ? replicationId : replication;
 }
 
-namespace expt {
+namespace expt
+{
 
 template <typename ThreadIterationGetter, typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
@@ -226,18 +228,25 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
     }
   }
 
-  static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <= RAJA::policy::hip::device_constants.WARP_SIZE,
-               "Max Warps must be less than or equal to Warp Size for this algorithm to work");
+  static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
+                "Max Warps must be less than or equal to Warp Size for this "
+                "algorithm to work");
 
   // reduce per warp values
   if (numThreads > RAJA::policy::hip::device_constants.WARP_SIZE) {
 
     // Need to separate declaration and initialization for clang-hip
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::SoAArray<T,
+                               RAJA::policy::hip::device_constants.MAX_WARPS>)];
 
     // Partial placement new: Should call new(tmpsd) here but recasting memory
     // to avoid calling constructor/destructor in shared memory.
-    RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS> * sd = reinterpret_cast<RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS> *>(tmpsd);
+    RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS>*
+        sd = reinterpret_cast<RAJA::detail::SoAArray<
+            T,
+            RAJA::policy::hip::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
     if (warpId == 0) {
@@ -255,7 +264,8 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
         temp = identity;
       }
 
-      for (int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS;
+           i *= 2) {
         T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i);
         temp = Combiner{}(temp, rhs);
       }
@@ -269,13 +279,16 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 
 
 template <typename GlobalIterationGetter, typename OP, typename T>
-RAJA_DEVICE RAJA_INLINE void grid_reduce( T * device_target,
-                                          T val,
-                                          RAJA::detail::SoAPtr<T,RAJA::hip::device_mempool_type> device_mem,
-                                          unsigned int * device_count)
+RAJA_DEVICE RAJA_INLINE void grid_reduce(
+    T* device_target,
+    T val,
+    RAJA::detail::SoAPtr<T, RAJA::hip::device_mempool_type> device_mem,
+    unsigned int* device_count)
 {
-  using BlockIterationGetter = typename get_index_block<GlobalIterationGetter>::type;
-  using ThreadIterationGetter = typename get_index_thread<GlobalIterationGetter>::type;
+  using BlockIterationGetter =
+      typename get_index_block<GlobalIterationGetter>::type;
+  using ThreadIterationGetter =
+      typename get_index_thread<GlobalIterationGetter>::type;
 
   const int numBlocks = BlockIterationGetter::size();
   const int numThreads = ThreadIterationGetter::size();
@@ -318,18 +331,21 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce( T * device_target,
   }
 }
 
-} //  namespace expt
+}  //  namespace expt
 
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride,
+template <typename Combiner,
+          typename Accessor,
+          int replication,
+          int atomic_stride,
           typename T>
-RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
-                                               T identity,
-                                               T* device_mem,
-                                               unsigned int* device_count)
+RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(
+    T& val,
+    T identity,
+    T* device_mem,
+    unsigned int* device_count)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -338,11 +354,11 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
                 (gridDim.x * gridDim.y) * blockIdx.z;
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset = replicationId * atomic_stride;
 
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
   if (numSlots <= 1u) {
     T temp = block_reduce<Combiner>(val, identity);
@@ -374,8 +390,9 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
     RAJA::reduce::hip::atomic<Combiner>{}(device_mem[atomicOffset], temp);
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots+1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1));
-    isLastBlock = (old_count == (numSlots+1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
+    isLastBlock = (old_count == (numSlots + 1));
 
     // the last block for each replication gets the value from device_mem
     if (isLastBlock) {
@@ -390,8 +407,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 //! reduce values in block into thread 0 and atomically combines into device_mem
 template <typename Combiner, int replication, int atomic_stride, typename T>
 RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
-                                                            T identity,
-                                                            T* device_mem)
+                                                          T identity,
+                                                          T* device_mem)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -399,8 +416,8 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                 (gridDim.x * gridDim.y) * blockIdx.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset = replicationId * atomic_stride;
 
   T temp = block_reduce<Combiner>(val, identity);
 
@@ -408,7 +425,6 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
   if (threadId == 0 && temp != identity) {
     RAJA::reduce::hip::atomic<Combiner>{}(device_mem[atomicOffset], temp);
   }
-
 }
 
 }  // namespace impl
@@ -497,7 +513,7 @@ class PinnedTally
       return ret;
     }
 
-    auto operator*() -> T(&)[num_slots] { return m_n->values; }
+    auto operator*() -> T (&)[num_slots] { return m_n->values; }
 
     bool operator==(const ResourceNodeIterator& rhs) const
     {
@@ -534,7 +550,7 @@ class PinnedTally
   ResourceNodeIterator end() { return {nullptr, nullptr}; }
 
   //! get new value for use in resource
-  auto new_value(::RAJA::resources::Hip res) -> T(&)[num_slots]
+  auto new_value(::RAJA::resources::Hip res) -> T (&)[num_slots]
   {
 #if defined(RAJA_ENABLE_OPENMP)
     lock_guard<omp::mutex> lock(m_mutex);
@@ -601,10 +617,12 @@ class PinnedTally
 
 //! Reduction data for Hip Offload -- stores value, host pointer, and device
 //! pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
-struct ReduceLastBlock_Data
-{
+template <typename Combiner,
+          typename Accessor,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
+struct ReduceLastBlock_Data {
   using tally_mempool_type = pinned_mempool_type;
   using data_mempool_type = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
@@ -647,7 +665,7 @@ struct ReduceLastBlock_Data
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
     for (size_t r = 0; r < tally_slots; ++r) {
       output[r] = identity;
@@ -660,9 +678,9 @@ struct ReduceLastBlock_Data
   void grid_reduce(T* output)
   {
     T temp = value;
-    size_t replicationId = impl::grid_reduce_last_block<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
+    size_t replicationId = impl::
+        grid_reduce_last_block<Combiner, Accessor, replication, atomic_stride>(
+            temp, identity, device, device_count);
     if (replicationId != replication) {
       output[replicationId] = temp;
     }
@@ -677,9 +695,10 @@ struct ReduceLastBlock_Data
       hip_dim_t gridDim = currentGridDim();
       size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
       size_t maxNumSlots = (numBlocks + replication - 1) / replication;
-      device.allocate(maxNumSlots*replication);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+      device.allocate(maxNumSlots * replication);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       own_device_ptr = true;
     }
     return act;
@@ -702,10 +721,11 @@ struct ReduceLastBlock_Data
 
 
 //! Reduction data for Hip Offload -- stores value, host pointer
-template <typename Combiner, typename T,
-          size_t replication, size_t atomic_stride>
-struct ReduceAtomicHostInit_Data
-{
+template <typename Combiner,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
+struct ReduceAtomicHostInit_Data {
   using tally_mempool_type = device_pinned_mempool_type;
 
   static constexpr size_t tally_slots = replication * atomic_stride;
@@ -715,7 +735,7 @@ struct ReduceAtomicHostInit_Data
   bool is_setup;
   bool own_device_ptr;
 
-  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){}
+  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()) {}
 
   ReduceAtomicHostInit_Data(T initValue, T identity_)
       : value{initValue},
@@ -734,11 +754,12 @@ struct ReduceAtomicHostInit_Data
   {
   }
 
-  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default;
+  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) =
+      default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
     for (size_t r = 0; r < tally_slots; ++r) {
       output[r] = identity;
@@ -753,7 +774,7 @@ struct ReduceAtomicHostInit_Data
     T temp = value;
 
     impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
-            temp, identity, output);
+        temp, identity, output);
   }
 
   //! check and setup for device
@@ -782,10 +803,12 @@ struct ReduceAtomicHostInit_Data
 };
 
 //! Reduction data for Hip Offload -- stores value, host pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
-struct ReduceAtomicDeviceInit_Data
-{
+template <typename Combiner,
+          typename Accessor,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
+struct ReduceAtomicDeviceInit_Data {
   using tally_mempool_type = pinned_mempool_type;
   using data_mempool_type = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
@@ -798,7 +821,7 @@ struct ReduceAtomicDeviceInit_Data
   T* device;
   bool own_device_ptr;
 
-  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){}
+  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()) {}
 
   ReduceAtomicDeviceInit_Data(T initValue, T identity_)
       : value{initValue},
@@ -819,11 +842,12 @@ struct ReduceAtomicDeviceInit_Data
   {
   }
 
-  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default;
+  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) =
+      default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
     for (size_t r = 0; r < tally_slots; ++r) {
       output[r] = identity;
@@ -837,9 +861,11 @@ struct ReduceAtomicDeviceInit_Data
   {
     T temp = value;
 
-    size_t replicationId = impl::grid_reduce_atomic_device_init<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
+    size_t replicationId = impl::grid_reduce_atomic_device_init<Combiner,
+                                                                Accessor,
+                                                                replication,
+                                                                atomic_stride>(
+        temp, identity, device, device_count);
     if (replicationId != replication) {
       output[replicationId] = temp;
     }
@@ -851,9 +877,11 @@ struct ReduceAtomicDeviceInit_Data
   {
     bool act = !device && setupReducers();
     if (act) {
-      device = data_mempool_type::getInstance().template malloc<T>(replication*atomic_stride);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+      device = data_mempool_type::getInstance().template malloc<T>(
+          replication * atomic_stride);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       own_device_ptr = true;
     }
     return act;
@@ -880,41 +908,68 @@ struct ReduceAtomicDeviceInit_Data
 template <typename Combiner, typename T, typename tuning>
 class Reduce
 {
-  static constexpr size_t replication = (tuning::replication > 0)
-      ? tuning::replication
-      : 32;
-  static constexpr size_t atomic_stride = (tuning::atomic_stride > 0)
-      ? tuning::atomic_stride
-      : ((policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
-        ? RAJA_DIVIDE_CEILING_INT(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T))
-        : 1);
-
-  using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence),
+  static constexpr size_t replication =
+      (tuning::replication > 0) ? tuning::replication : 32;
+  static constexpr size_t atomic_stride =
+      (tuning::atomic_stride > 0)
+          ? tuning::atomic_stride
+          : ((policy::hip::device_constants
+                  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
+                 ? RAJA_DIVIDE_CEILING_INT(
+                       policy::hip::device_constants
+                           .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
+                       sizeof(T))
+                 : 1);
+
+  using Accessor = std::conditional_t<
+      (tuning::comm_mode == block_communication_mode::block_fence),
       impl::AccessorDeviceScopeUseBlockFence,
-      std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence),
-        impl::AccessorDeviceScopeUseDeviceFence,
-        void>>;
+      std::conditional_t<(tuning::comm_mode ==
+                          block_communication_mode::device_fence),
+                         impl::AccessorDeviceScopeUseDeviceFence,
+                         void>>;
 
   static constexpr bool atomic_policy =
-      (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) ||
+      (tuning::algorithm ==
+       reduce_algorithm::init_device_combine_atomic_block) ||
       (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
-  static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::hip::hip_atomic_available<T>::value;
 
   //! hip reduction data storage class and folding algorithm
-  using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) ||
-                                              (atomic_policy && !atomic_available),
-      hip::ReduceLastBlock_Data<Combiner, Accessor, T, replication, atomic_stride>,
-      std::conditional_t<atomic_available,
-        std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block),
-          hip::ReduceAtomicDeviceInit_Data<Combiner, Accessor, T, replication, atomic_stride>,
-          std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block),
-            hip::ReduceAtomicHostInit_Data<Combiner, T, replication, atomic_stride>,
-            void>>,
-        void>>;
+  using reduce_data_type = std::conditional_t<
+      (tuning::algorithm == reduce_algorithm::combine_last_block) ||
+          (atomic_policy && !atomic_available),
+      hip::ReduceLastBlock_Data<Combiner,
+                                Accessor,
+                                T,
+                                replication,
+                                atomic_stride>,
+      std::conditional_t<
+          atomic_available,
+          std::conditional_t<
+              (tuning::algorithm ==
+               reduce_algorithm::init_device_combine_atomic_block),
+              hip::ReduceAtomicDeviceInit_Data<Combiner,
+                                               Accessor,
+                                               T,
+                                               replication,
+                                               atomic_stride>,
+              std::conditional_t<
+                  (tuning::algorithm ==
+                   reduce_algorithm::init_host_combine_atomic_block),
+                  hip::ReduceAtomicHostInit_Data<Combiner,
+                                                 T,
+                                                 replication,
+                                                 atomic_stride>,
+                  void>>,
+          void>>;
 
   static constexpr size_t tally_slots = reduce_data_type::tally_slots;
 
-  using TallyType = PinnedTally<T, tally_slots, typename reduce_data_type::tally_mempool_type>;
+  using TallyType = PinnedTally<T,
+                                tally_slots,
+                                typename reduce_data_type::tally_mempool_type>;
 
   //! union to hold either pointer to PinnedTally or pointer to value
   //  only use list before setup for device and only use val_ptr after
@@ -931,9 +986,7 @@ class Reduce
   //! create a reduce object
   //  the original object's parent is itself
   explicit Reduce(T init_val, T identity_ = Combiner::identity())
-      : parent{this},
-        tally_or_val_ptr{new TallyType},
-        val(init_val, identity_)
+      : parent{this}, tally_or_val_ptr{new TallyType}, val(init_val, identity_)
   {
   }
 
@@ -1132,9 +1185,10 @@ class ReduceMax<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 //! specialization of ReduceMinLoc for hip_reduce
 template <typename tuning, typename T, typename IndexType>
 class ReduceMinLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
-    : public hip::Reduce<RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
-                          RAJA::reduce::detail::ValueLoc<T, IndexType>,
-                          tuning>
+    : public hip::Reduce<
+          RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType>,
+          tuning>
 {
 
 public:
@@ -1145,20 +1199,26 @@ class ReduceMinLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMinLoc(T init_val, IndexType init_idx,
+  ReduceMinLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
   {
   }
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
@@ -1182,10 +1242,11 @@ class ReduceMinLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
 //! specialization of ReduceMaxLoc for hip_reduce
 template <typename tuning, typename T, typename IndexType>
 class ReduceMaxLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
-    : public hip::
-          Reduce<RAJA::reduce::max<RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
-                 RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
-                 tuning>
+    : public hip::Reduce<
+          RAJA::reduce::max<
+              RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
+          tuning>
 {
 public:
   using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
@@ -1195,20 +1256,26 @@ class ReduceMaxLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMaxLoc(T init_val, IndexType init_idx,
+  ReduceMaxLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
   {
   }
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp
index cdf0a9b82d..d9f13ebbee 100644
--- a/include/RAJA/policy/hip/scan.hpp
+++ b/include/RAJA/policy/hip/scan.hpp
@@ -53,11 +53,10 @@ template <typename IterationMapping,
           bool Async,
           typename InputIter,
           typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-inclusive_inplace(
+RAJA_INLINE resources::EventProxy<resources::Hip> inclusive_inplace(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     Function binary_op)
@@ -127,11 +126,10 @@ template <typename IterationMapping,
           typename InputIter,
           typename Function,
           typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-exclusive_inplace(
+RAJA_INLINE resources::EventProxy<resources::Hip> exclusive_inplace(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     Function binary_op,
@@ -169,13 +167,13 @@ exclusive_inplace(
   // Run
 #if defined(__HIPCC__)
   hipErrchk(::rocprim::exclusive_scan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              init,
-                                              len,
-                                              binary_op,
-                                              stream));
+                                      temp_storage_bytes,
+                                      begin,
+                                      begin,
+                                      init,
+                                      len,
+                                      binary_op,
+                                      stream));
 #elif defined(__CUDACC__)
   hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
                                              temp_storage_bytes,
@@ -205,11 +203,10 @@ template <typename IterationMapping,
           typename InputIter,
           typename OutputIter,
           typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-inclusive(
+RAJA_INLINE resources::EventProxy<resources::Hip> inclusive(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     OutputIter out,
@@ -222,21 +219,11 @@ inclusive(
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(
+      d_temp_storage, temp_storage_bytes, begin, out, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             len,
-                                             stream));
+  hipErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -244,21 +231,11 @@ inclusive(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(
+      d_temp_storage, temp_storage_bytes, begin, out, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             len,
-                                             stream));
+  hipErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
@@ -280,11 +257,10 @@ template <typename IterationMapping,
           typename OutputIter,
           typename Function,
           typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-exclusive(
+RAJA_INLINE resources::EventProxy<resources::Hip> exclusive(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     OutputIter out,
diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp
index eb16246623..1517c6198a 100644
--- a/include/RAJA/policy/hip/sort.hpp
+++ b/include/RAJA/policy/hip/sort.hpp
@@ -28,17 +28,17 @@
 
 #if defined(__HIPCC__)
 #define ROCPRIM_HIP_API 1
-#include "rocprim/device/device_transform.hpp"
 #include "rocprim/device/device_radix_sort.hpp"
+#include "rocprim/device/device_transform.hpp"
 #elif defined(__CUDACC__)
 #include "cub/device/device_radix_sort.cuh"
 #endif
 
-#include "RAJA/util/concepts.hpp"
-#include "RAJA/util/Operators.hpp"
 #include "RAJA/pattern/detail/algorithm.hpp"
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #include "RAJA/policy/hip/policy.hpp"
+#include "RAJA/util/Operators.hpp"
+#include "RAJA/util/concepts.hpp"
 
 namespace RAJA
 {
@@ -51,52 +51,64 @@ namespace detail
 {
 
 #if defined(__HIPCC__)
-  template < typename R >
-  using double_buffer = ::rocprim::double_buffer<R>;
+template <typename R>
+using double_buffer = ::rocprim::double_buffer<R>;
 #elif defined(__CUDACC__)
-  template < typename R >
-  using double_buffer = ::cub::DoubleBuffer<R>;
+template <typename R>
+using double_buffer = ::cub::DoubleBuffer<R>;
 #endif
 
-  template < typename R >
-  R* get_current(double_buffer<R>& d_bufs)
-  {
+template <typename R>
+R* get_current(double_buffer<R>& d_bufs)
+{
 #if defined(__HIPCC__)
-    return d_bufs.current();
+  return d_bufs.current();
 #elif defined(__CUDACC__)
-    return d_bufs.Current();
+  return d_bufs.Current();
 #endif
-  }
-
 }
 
+}  // namespace detail
+
 /*!
         \brief static assert unimplemented stable sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-stable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter,
-    Iter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename Iter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+stable(resources::Hip hip_res,
+       ::RAJA::policy::hip::
+           hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+       Iter,
+       Iter,
+       Compare)
 {
-  static_assert(concepts::all_of<
-                  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                  std::is_pointer<Iter>,
-                  concepts::any_of<
-                    camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                    camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>::value,
-                "RAJA stable_sort<hip_exec> is only implemented for pointers to arithmetic types and RAJA::operators::less and RAJA::operators::greater.");
+  static_assert(
+      concepts::all_of<
+          type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+          std::is_pointer<Iter>,
+          concepts::any_of<
+              camp::is_same<Compare,
+                            operators::less<RAJA::detail::IterVal<Iter>>>,
+              camp::is_same<Compare,
+                            operators::greater<RAJA::detail::IterVal<Iter>>>>>::
+          value,
+      "RAJA stable_sort<hip_exec> is only implemented for pointers to "
+      "arithmetic types and RAJA::operators::less and "
+      "RAJA::operators::greater.");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -104,26 +116,28 @@ stable(
 /*!
         \brief stable sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>>)
+stable(resources::Hip hip_res,
+       ::RAJA::policy::hip::
+           hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+       Iter begin,
+       Iter end,
+       operators::less<RAJA::detail::IterVal<Iter>>)
 {
   hipStream_t stream = hip_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
   int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int begin_bit = 0;
+  int end_bit = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = hip::device_mempool_type::getInstance().malloc<R>(len);
@@ -181,7 +195,8 @@ stable(
   if (detail::get_current(d_keys) == d_out) {
 
     // copy
-    hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(
+        begin, d_out, len * sizeof(R), hipMemcpyDefault, stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_out);
@@ -194,26 +209,28 @@ stable(
 /*!
         \brief stable sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>>)
+stable(resources::Hip hip_res,
+       ::RAJA::policy::hip::
+           hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+       Iter begin,
+       Iter end,
+       operators::greater<RAJA::detail::IterVal<Iter>>)
 {
   hipStream_t stream = hip_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
   int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int begin_bit = 0;
+  int end_bit = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = hip::device_mempool_type::getInstance().malloc<R>(len);
@@ -271,7 +288,8 @@ stable(
   if (detail::get_current(d_keys) == d_out) {
 
     // copy
-    hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(
+        begin, d_out, len * sizeof(R), hipMemcpyDefault, stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_out);
@@ -285,30 +303,41 @@ stable(
 /*!
         \brief static assert unimplemented sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-unstable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter,
-    Iter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename Iter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+unstable(resources::Hip hip_res,
+         ::RAJA::policy::hip::
+             hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+         Iter,
+         Iter,
+         Compare)
 {
-  static_assert(concepts::all_of<
-                  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                  std::is_pointer<Iter>,
-                  concepts::any_of<
-                    camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                    camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>::value,
-                "RAJA sort<hip_exec> is only implemented for pointers to arithmetic types and RAJA::operators::less and RAJA::operators::greater.");
+  static_assert(
+      concepts::all_of<
+          type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+          std::is_pointer<Iter>,
+          concepts::any_of<
+              camp::is_same<Compare,
+                            operators::less<RAJA::detail::IterVal<Iter>>>,
+              camp::is_same<Compare,
+                            operators::greater<RAJA::detail::IterVal<Iter>>>>>::
+          value,
+      "RAJA sort<hip_exec> is only implemented for pointers to arithmetic "
+      "types and RAJA::operators::less and RAJA::operators::greater.");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -316,18 +345,20 @@ unstable(
 /*!
         \brief sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Hip hip_res,
+         ::RAJA::policy::hip::
+             hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+         Iter begin,
+         Iter end,
+         operators::less<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(hip_res, p, begin, end, comp);
 }
@@ -335,18 +366,20 @@ unstable(
 /*!
         \brief sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Hip hip_res,
+         ::RAJA::policy::hip::
+             hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+         Iter begin,
+         Iter end,
+         operators::greater<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(hip_res, p, begin, end, comp);
 }
@@ -355,36 +388,47 @@ unstable(
 /*!
         \brief static assert unimplemented stable sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
 stable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter,
     KeyIter,
     ValIter,
     Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "stable_sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "stable_sort_pairs<hip_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(type_traits::is_arithmetic<K>::value,
+                "stable_sort_pairs<hip_exec> is only implemented for "
+                "arithmetic types");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "stable_sort_pairs<hip_exec> is only implemented for "
+      "RAJA::operators::less or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -392,16 +436,21 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 stable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
@@ -413,8 +462,8 @@ stable_pairs(
   using V = RAJA::detail::IterVal<ValIter>;
 
   int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int begin_bit = 0;
+  int end_bit = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = hip::device_mempool_type::getInstance().malloc<K>(len);
@@ -478,12 +527,14 @@ stable_pairs(
   if (detail::get_current(d_keys) == d_keys_out) {
 
     // copy keys
-    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(
+        keys_begin, d_keys_out, len * sizeof(K), hipMemcpyDefault, stream));
   }
   if (detail::get_current(d_vals) == d_vals_out) {
 
     // copy vals
-    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(
+        vals_begin, d_vals_out, len * sizeof(V), hipMemcpyDefault, stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_keys_out);
@@ -497,16 +548,21 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 stable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
@@ -518,8 +574,8 @@ stable_pairs(
   using V = RAJA::detail::IterVal<ValIter>;
 
   int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int begin_bit = 0;
+  int end_bit = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = hip::device_mempool_type::getInstance().malloc<K>(len);
@@ -583,12 +639,14 @@ stable_pairs(
   if (detail::get_current(d_keys) == d_keys_out) {
 
     // copy keys
-    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(
+        keys_begin, d_keys_out, len * sizeof(K), hipMemcpyDefault, stream));
   }
   if (detail::get_current(d_vals) == d_vals_out) {
 
     // copy vals
-    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(
+        vals_begin, d_vals_out, len * sizeof(V), hipMemcpyDefault, stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_keys_out);
@@ -603,36 +661,47 @@ stable_pairs(
 /*!
         \brief static assert unimplemented sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
 unstable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter,
     KeyIter,
     ValIter,
     Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "sort_pairs<hip_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "sort_pairs<hip_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
-      "sort_pairs<hip_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "sort_pairs<hip_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(type_traits::is_arithmetic<K>::value,
+                "sort_pairs<hip_exec> is only implemented for arithmetic "
+                "types");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "sort_pairs<hip_exec> is only implemented for RAJA::operators::less or "
+      "RAJA::operators::greater");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -640,16 +709,21 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 unstable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
@@ -661,16 +735,21 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 unstable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
diff --git a/include/RAJA/policy/openmp.hpp b/include/RAJA/policy/openmp.hpp
index fc29dabcbf..bbd07a6d00 100644
--- a/include/RAJA/policy/openmp.hpp
+++ b/include/RAJA/policy/openmp.hpp
@@ -26,24 +26,25 @@
 #if defined(RAJA_ENABLE_OPENMP)
 
 #include <omp.h>
+
 #include <iostream>
 #include <thread>
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/openmp/atomic.hpp"
+#include "RAJA/policy/openmp/atomic.hpp"
 #endif
 
+#include "RAJA/policy/openmp/WorkGroup.hpp"
 #include "RAJA/policy/openmp/forall.hpp"
 #include "RAJA/policy/openmp/kernel.hpp"
+#include "RAJA/policy/openmp/launch.hpp"
+#include "RAJA/policy/openmp/multi_reduce.hpp"
 #include "RAJA/policy/openmp/policy.hpp"
 #include "RAJA/policy/openmp/reduce.hpp"
-#include "RAJA/policy/openmp/multi_reduce.hpp"
 #include "RAJA/policy/openmp/region.hpp"
 #include "RAJA/policy/openmp/scan.hpp"
 #include "RAJA/policy/openmp/sort.hpp"
 #include "RAJA/policy/openmp/synchronize.hpp"
-#include "RAJA/policy/openmp/launch.hpp"
-#include "RAJA/policy/openmp/WorkGroup.hpp"
 
 
 #endif  // closing endif for if defined(RAJA_ENABLE_OPENMP)
diff --git a/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp
index 09861941ab..34653c67da 100644
--- a/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp
@@ -19,9 +19,7 @@
 #define RAJA_openmp_WorkGroup_Dispatcher_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/openmp/policy.hpp"
-
 #include "RAJA/policy/sequential/WorkGroup/Dispatcher.hpp"
 
 
@@ -32,9 +30,9 @@ namespace detail
 {
 
 /*!
-* Populate and return a Dispatcher object
-*/
-template < typename T, typename Dispatcher_T >
+ * Populate and return a Dispatcher object
+ */
+template <typename T, typename Dispatcher_T>
 inline const Dispatcher_T* get_Dispatcher(omp_work const&)
 {
   return get_Dispatcher<T, Dispatcher_T>(seq_work{});
diff --git a/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp
index c889273a0f..8f30e1f1e2 100644
--- a/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp
@@ -19,10 +19,8 @@
 #define RAJA_openmp_WorkGroup_WorkRunner_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/policy/openmp/policy.hpp"
-
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
+#include "RAJA/policy/openmp/policy.hpp"
 
 
 namespace RAJA
@@ -38,23 +36,21 @@ namespace detail
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::omp_parallel_for_exec,
-        RAJA::omp_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::omp_work,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallOrdered<RAJA::omp_parallel_for_exec,
+                              RAJA::omp_work,
+                              RAJA::ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...> {
+};
 
 /*!
  * Runs work in a storage container in reverse order
@@ -63,23 +59,21 @@ struct WorkRunner<
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::omp_parallel_for_exec,
-        RAJA::omp_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::omp_work,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallReverse<RAJA::omp_parallel_for_exec,
+                              RAJA::omp_work,
+                              RAJA::reverse_ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...> {
+};
 
 }  // namespace detail
 
diff --git a/include/RAJA/policy/openmp/atomic.hpp b/include/RAJA/policy/openmp/atomic.hpp
index 2dc047dd95..cbf0f951c8 100644
--- a/include/RAJA/policy/openmp/atomic.hpp
+++ b/include/RAJA/policy/openmp/atomic.hpp
@@ -23,7 +23,6 @@
 #if defined(RAJA_ENABLE_OPENMP)
 
 #include "RAJA/policy/openmp/policy.hpp"
-
 #include "RAJA/util/macros.hpp"
 
 
@@ -35,8 +34,7 @@ namespace RAJA
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicLoad(omp_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(omp_atomic, T *acc)
 {
   T ret;
 #pragma omp atomic capture
@@ -49,8 +47,7 @@ RAJA_INLINE T atomicLoad(omp_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE void atomicStore(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE void atomicStore(omp_atomic, T *acc, T value)
 {
   T ret;
 #pragma omp atomic capture
@@ -63,8 +60,7 @@ RAJA_INLINE void atomicStore(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAdd(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(omp_atomic, T *acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -78,8 +74,7 @@ RAJA_INLINE T atomicAdd(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicSub(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(omp_atomic, T *acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -93,16 +88,14 @@ RAJA_INLINE T atomicSub(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMin(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(omp_atomic, T *acc, T value)
 {
 #if _OPENMP >= 202011
   T old;
-  #pragma omp atomic capture compare
+#pragma omp atomic capture compare
   {
     old = *acc;
-    if ( value < *acc )
-    {
+    if (value < *acc) {
       *acc = value;
     }
   }
@@ -115,16 +108,14 @@ RAJA_INLINE T atomicMin(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMax(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(omp_atomic, T *acc, T value)
 {
 #if _OPENMP >= 202011
   T old;
-  #pragma omp atomic capture compare
+#pragma omp atomic capture compare
   {
     old = *acc;
-    if ( value > *acc )
-    {
+    if (value > *acc) {
       *acc = value;
     }
   }
@@ -138,8 +129,7 @@ RAJA_INLINE T atomicMax(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(omp_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(omp_atomic, T *acc)
 {
   T old;
 #pragma omp atomic capture
@@ -153,8 +143,7 @@ RAJA_INLINE T atomicInc(omp_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(omp_atomic, T *acc, T value)
 {
   // OpenMP doesn't define needed operations, so use builtin atomics
   return RAJA::atomicInc(builtin_atomic{}, acc, value);
@@ -163,8 +152,7 @@ RAJA_INLINE T atomicInc(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(omp_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(omp_atomic, T *acc)
 {
   T old;
 #pragma omp atomic capture
@@ -178,8 +166,7 @@ RAJA_INLINE T atomicDec(omp_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(omp_atomic, T *acc, T value)
 {
   // OpenMP doesn't define needed operations, so use builtin atomics
   return RAJA::atomicDec(builtin_atomic{}, acc, value);
@@ -187,8 +174,7 @@ RAJA_INLINE T atomicDec(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAnd(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(omp_atomic, T *acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -201,8 +187,7 @@ RAJA_INLINE T atomicAnd(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicOr(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(omp_atomic, T *acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -215,8 +200,7 @@ RAJA_INLINE T atomicOr(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicXor(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(omp_atomic, T *acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -229,8 +213,7 @@ RAJA_INLINE T atomicXor(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(omp_atomic, T *acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -243,14 +226,13 @@ RAJA_INLINE T atomicExchange(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(omp_atomic, T *acc, T compare, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(omp_atomic, T *acc, T compare, T value)
 {
   // OpenMP doesn't define atomic ternary operators so use builtin atomics
   return RAJA::atomicCAS(builtin_atomic{}, acc, compare, value);
 }
 
-#endif // not defined RAJA_COMPILER_MSVC
+#endif  // not defined RAJA_COMPILER_MSVC
 
 
 }  // namespace RAJA
diff --git a/include/RAJA/policy/openmp/forall.hpp b/include/RAJA/policy/openmp/forall.hpp
index 815168ae98..a3bcdf2a66 100644
--- a/include/RAJA/policy/openmp/forall.hpp
+++ b/include/RAJA/policy/openmp/forall.hpp
@@ -25,27 +25,21 @@
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-#include <iostream>
-#include <type_traits>
-
 #include <omp.h>
 
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/internal/fault_tolerance.hpp"
+#include <iostream>
+#include <type_traits>
 
 #include "RAJA/index/IndexSet.hpp"
 #include "RAJA/index/ListSegment.hpp"
 #include "RAJA/index/RangeSegment.hpp"
-
-#include "RAJA/policy/openmp/policy.hpp"
-
+#include "RAJA/internal/fault_tolerance.hpp"
 #include "RAJA/pattern/forall.hpp"
-#include "RAJA/pattern/region.hpp"
-
 #include "RAJA/pattern/params/forall.hpp"
-
+#include "RAJA/pattern/region.hpp"
 #include "RAJA/policy/openmp/params/forall.hpp"
+#include "RAJA/policy/openmp/policy.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -55,12 +49,14 @@ namespace policy
 namespace omp
 {
 
-template <typename Iterable, typename Func, typename InnerPolicy, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+template <typename Iterable,
+          typename Func,
+          typename InnerPolicy,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Host host_res,
             const omp_parallel_exec<InnerPolicy>&,
             Iterable&& iter,
@@ -83,249 +79,277 @@ forall_impl(resources::Host host_res,
 namespace internal
 {
 
-  /// Tag dispatch for omp forall
-
-  //
-  // omp for (Auto)
-  //
-  template <typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Auto&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
-  }
+/// Tag dispatch for omp forall
 
-  //
-  // omp for schedule(static)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+//
+// omp for (Auto)
+//
+template <typename Iterable, typename Func>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Auto&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(static, ChunkSize)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static, ChunkSize)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+//
+// omp for schedule(static)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(dynamic)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(dynamic)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+//
+// omp for schedule(static, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static, ChunkSize)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(dynamic, ChunkSize)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(dynamic, ChunkSize)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+//
+// omp for schedule(dynamic)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(dynamic)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(guided)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(guided)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+//
+// omp for schedule(dynamic, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(dynamic, ChunkSize)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(guided, ChunkSize)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(guided, ChunkSize)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+//
+// omp for schedule(guided)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(guided)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(runtime)
-  //
-  template <typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(runtime)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+//
+// omp for schedule(guided, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(guided, ChunkSize)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
+}
 
-  // TODO :: not implemented in forall param interface ...
-  #if !defined(RAJA_COMPILER_MSVC)
-  // dynamic & guided
-  template <typename Policy, typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl(const Policy&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    omp_sched_t prev_sched;
-    int prev_chunk;
-    omp_get_schedule(&prev_sched, &prev_chunk);
-    omp_set_schedule(Policy::schedule, Policy::chunk_size);
-    forall_impl(::RAJA::policy::omp::Runtime{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
-    omp_set_schedule(prev_sched, prev_chunk);
-  }
-  #endif
-
-
-  /// Tag dispatch for omp forall with nowait
-
-  //
-  // omp for nowait (Auto)
-  //
-  template <typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for nowait
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+//
+// omp for schedule(runtime)
+//
+template <typename Iterable, typename Func>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(runtime)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(static) nowait
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static) nowait
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
-  }
+// TODO :: not implemented in forall param interface ...
+#if !defined(RAJA_COMPILER_MSVC)
+// dynamic & guided
+template <typename Policy, typename Iterable, typename Func>
+RAJA_INLINE void forall_impl(const Policy&, Iterable&& iter, Func&& loop_body)
+{
+  omp_sched_t prev_sched;
+  int prev_chunk;
+  omp_get_schedule(&prev_sched, &prev_chunk);
+  omp_set_schedule(Policy::schedule, Policy::chunk_size);
+  forall_impl(::RAJA::policy::omp::Runtime{},
+              std::forward<Iterable>(iter),
+              std::forward<Func>(loop_body));
+  omp_set_schedule(prev_sched, prev_chunk);
+}
+#endif
 
-  //
-  // omp for schedule(static, ChunkSize) nowait
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static, ChunkSize) nowait
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+
+/// Tag dispatch for omp forall with nowait
+
+//
+// omp for nowait (Auto)
+//
+template <typename Iterable, typename Func>
+RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto&,
+                                    Iterable&& iter,
+                                    Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for nowait
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
+}
 
-  //TODO :: not implemented in param interface...
-  #if !defined(RAJA_COMPILER_MSVC)
-  // dynamic & guided
-  template <typename Policy, typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl_nowait(const Policy&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    omp_sched_t prev_sched;
-    int prev_chunk;
-    omp_get_schedule(&prev_sched, &prev_chunk);
-    omp_set_schedule(Policy::schedule, Policy::chunk_size);
-    forall_impl_nowait(::RAJA::policy::omp::Runtime{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
-    omp_set_schedule(prev_sched, prev_chunk);
+//
+// omp for schedule(static) nowait
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl_nowait(
+    const ::RAJA::policy::omp::Static<ChunkSize>&,
+    Iterable&& iter,
+    Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static) nowait
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
-  #endif
+}
 
-} // end namespace internal
+//
+// omp for schedule(static, ChunkSize) nowait
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl_nowait(
+    const ::RAJA::policy::omp::Static<ChunkSize>&,
+    Iterable&& iter,
+    Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static, ChunkSize) nowait
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
+  }
+}
 
-template <typename Schedule, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+// TODO :: not implemented in param interface...
+#if !defined(RAJA_COMPILER_MSVC)
+// dynamic & guided
+template <typename Policy, typename Iterable, typename Func>
+RAJA_INLINE void forall_impl_nowait(const Policy&,
+                                    Iterable&& iter,
+                                    Func&& loop_body)
+{
+  omp_sched_t prev_sched;
+  int prev_chunk;
+  omp_get_schedule(&prev_sched, &prev_chunk);
+  omp_set_schedule(Policy::schedule, Policy::chunk_size);
+  forall_impl_nowait(::RAJA::policy::omp::Runtime{},
+                     std::forward<Iterable>(iter),
+                     std::forward<Func>(loop_body));
+  omp_set_schedule(prev_sched, prev_chunk);
+}
+#endif
+
+}  // end namespace internal
+
+template <typename Schedule,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Host host_res,
             const omp_for_schedule_exec<Schedule>&,
             Iterable&& iter,
             Func&& loop_body,
             ForallParam)
 {
-  internal::forall_impl(Schedule{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
+  internal::forall_impl(Schedule{},
+                        std::forward<Iterable>(iter),
+                        std::forward<Func>(loop_body));
   return resources::EventProxy<resources::Host>(host_res);
 }
 
-template <typename Schedule, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+template <typename Schedule,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Host host_res,
             const omp_for_nowait_schedule_exec<Schedule>&,
             Iterable&& iter,
             Func&& loop_body,
             ForallParam)
 {
-  internal::forall_impl_nowait(Schedule{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
+  internal::forall_impl_nowait(Schedule{},
+                               std::forward<Iterable>(iter),
+                               std::forward<Func>(loop_body));
   return resources::EventProxy<resources::Host>(host_res);
 }
 
diff --git a/include/RAJA/policy/openmp/kernel/Collapse.hpp b/include/RAJA/policy/openmp/kernel/Collapse.hpp
index ba71ac2fbf..ff345f83de 100644
--- a/include/RAJA/policy/openmp/kernel/Collapse.hpp
+++ b/include/RAJA/policy/openmp/kernel/Collapse.hpp
@@ -23,15 +23,12 @@
 #if defined(RAJA_ENABLE_OPENMP)
 
 #include "RAJA/pattern/detail/privatizer.hpp"
-
 #include "RAJA/pattern/kernel/Collapse.hpp"
 #include "RAJA/pattern/kernel/internal.hpp"
-
+#include "RAJA/policy/openmp/policy.hpp"
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
 
-#include "RAJA/policy/openmp/policy.hpp"
-
 namespace RAJA
 {
 
@@ -48,10 +45,14 @@ namespace internal
 // Collapsing two loops
 /////////
 
-template <camp::idx_t Arg0, camp::idx_t Arg1, typename... EnclosedStmts, typename Types>
+template <camp::idx_t Arg0,
+          camp::idx_t Arg1,
+          typename... EnclosedStmts,
+          typename Types>
 struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1>,
-                                             EnclosedStmts...>, Types> {
+                                             EnclosedStmts...>,
+                         Types> {
 
 
   template <typename Data>
@@ -78,7 +79,8 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
         auto& private_data = privatizer.get_priv();
         private_data.template assign_offset<Arg0>(i0);
         private_data.template assign_offset<Arg1>(i1);
-        execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(private_data);
+        execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(
+            private_data);
       }
     }
   }
@@ -92,7 +94,8 @@ template <camp::idx_t Arg0,
           typename Types>
 struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1, Arg2>,
-                                             EnclosedStmts...>, Types> {
+                                             EnclosedStmts...>,
+                         Types> {
 
 
   template <typename Data>
@@ -121,7 +124,8 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
           private_data.template assign_offset<Arg0>(i0);
           private_data.template assign_offset<Arg1>(i1);
           private_data.template assign_offset<Arg2>(i2);
-          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(private_data);
+          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(
+              private_data);
         }
       }
     }
@@ -129,9 +133,6 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
 };
 
 
-
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp b/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp
index 65f56010bc..55fc1e7076 100644
--- a/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp
+++ b/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp
@@ -23,13 +23,10 @@
 #if defined(RAJA_ENABLE_OPENMP)
 
 #include "RAJA/pattern/kernel/internal.hpp"
-
+#include "RAJA/policy/openmp/policy.hpp"
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
 
-#include "RAJA/policy/openmp/policy.hpp"
-
-
 
 namespace RAJA
 {
@@ -39,29 +36,24 @@ namespace statement
 struct OmpSyncThreads : public internal::Statement<camp::nil> {
 };
 
-} // namespace statement
+}  // namespace statement
 
 namespace internal
 {
 
 
-
-//Statement executor to synchronize omp threads inside a kernel region
-template<typename Types>
+// Statement executor to synchronize omp threads inside a kernel region
+template <typename Types>
 struct StatementExecutor<statement::OmpSyncThreads, Types> {
 
-template<typename Data>
-static RAJA_INLINE void exec(Data &&)
-{
-  #pragma omp barrier
-}
-
+  template <typename Data>
+  static RAJA_INLINE void exec(Data &&)
+  {
+#pragma omp barrier
+  }
 };
 
 
-
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp
index 7856bd6fda..75fa595b43 100644
--- a/include/RAJA/policy/openmp/launch.hpp
+++ b/include/RAJA/policy/openmp/launch.hpp
@@ -28,45 +28,54 @@ template <>
 struct LaunchExecute<RAJA::omp_launch_t> {
 
   template <typename BODY, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, LaunchParams const &params, const char *, BODY const &body, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const &params,
+       const char *,
+       BODY const &body,
+       ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
   {
     RAJA::region<RAJA::omp_parallel_region>([&]() {
+      LaunchContext ctx;
 
-        LaunchContext ctx;
-
-        using RAJA::internal::thread_privatize;
-        auto loop_body = thread_privatize(body);
+      using RAJA::internal::thread_privatize;
+      auto loop_body = thread_privatize(body);
 
-        ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
+      ctx.shared_mem_ptr = (char *)malloc(params.shared_mem_size);
 
-        loop_body.get_priv()(ctx);
+      loop_body.get_priv()(ctx);
 
-        free(ctx.shared_mem_ptr);
-        ctx.shared_mem_ptr = nullptr;
+      free(ctx.shared_mem_ptr);
+      ctx.shared_mem_ptr = nullptr;
     });
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  template<typename ReduceParams, typename BODY>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       const char *RAJA_UNUSED_ARG(kernel_name),  BODY const &body, ReduceParams &f_params)
+  template <typename ReduceParams, typename BODY>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const &launch_params,
+       const char *RAJA_UNUSED_ARG(kernel_name),
+       BODY const &body,
+       ReduceParams &f_params)
   {
 
     using EXEC_POL = RAJA::omp_launch_t;
 
     expt::ParamMultiplexer::init<EXEC_POL>(f_params);
 
-    //reducer object must be named f_params as expected by macro below
+    // reducer object must be named f_params as expected by macro below
     RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-   #pragma omp parallel reduction(combine : f_params)
+#pragma omp parallel reduction(combine : f_params)
     {
 
       LaunchContext ctx;
@@ -74,7 +83,7 @@ struct LaunchExecute<RAJA::omp_launch_t> {
       using RAJA::internal::thread_privatize;
       auto loop_body = thread_privatize(body);
 
-      ctx.shared_mem_ptr = (char*) malloc(launch_params.shared_mem_size);
+      ctx.shared_mem_ptr = (char *)malloc(launch_params.shared_mem_size);
 
       expt::invoke_body(f_params, loop_body.get_priv(), ctx);
 
@@ -86,7 +95,6 @@ struct LaunchExecute<RAJA::omp_launch_t> {
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
@@ -248,9 +256,9 @@ struct LoopICountExecute<omp_for_exec, SEGMENT> {
     int len = segment.end() - segment.begin();
 
 #pragma omp for
-      for (int i = 0; i < len; i++) {
-        body(*(segment.begin() + i), i);
-      }
+    for (int i = 0; i < len; i++) {
+      body(*(segment.begin() + i), i);
+    }
   }
 
   template <typename BODY>
@@ -265,15 +273,12 @@ struct LoopICountExecute<omp_for_exec, SEGMENT> {
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
+    for (int j = 0; j < len1; j++) {
+      for (int i = 0; i < len0; i++) {
 
-          body(*(segment0.begin() + i),
-               *(segment1.begin() + j),
-               i,
-               j);
-        }
+        body(*(segment0.begin() + i), *(segment1.begin() + j), i, j);
       }
+    }
   }
 
   template <typename BODY>
@@ -290,18 +295,18 @@ struct LoopICountExecute<omp_for_exec, SEGMENT> {
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-      for (int k = 0; k < len2; k++) {
-        for (int j = 0; j < len1; j++) {
-          for (int i = 0; i < len0; i++) {
-            body(*(segment0.begin() + i),
-                 *(segment1.begin() + j),
-                 *(segment2.begin() + k),
-                 i,
-                 j,
-                 k);
-          }
+    for (int k = 0; k < len2; k++) {
+      for (int j = 0; j < len1; j++) {
+        for (int i = 0; i < len0; i++) {
+          body(*(segment0.begin() + i),
+               *(segment1.begin() + j),
+               *(segment2.begin() + k),
+               i,
+               j,
+               k);
         }
       }
+    }
   }
 };
 
diff --git a/include/RAJA/policy/openmp/multi_reduce.hpp b/include/RAJA/policy/openmp/multi_reduce.hpp
index 22b09a7722..907cc81e02 100644
--- a/include/RAJA/policy/openmp/multi_reduce.hpp
+++ b/include/RAJA/policy/openmp/multi_reduce.hpp
@@ -25,21 +25,18 @@
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-#include <memory>
-#include <vector>
-
 #include <omp.h>
 
-#include "RAJA/util/types.hpp"
-#include "RAJA/util/reduce.hpp"
-#include "RAJA/util/RepeatView.hpp"
+#include <memory>
+#include <vector>
 
 #include "RAJA/internal/MemUtils_CPU.hpp"
-
 #include "RAJA/pattern/detail/multi_reduce.hpp"
 #include "RAJA/pattern/multi_reduce.hpp"
-
 #include "RAJA/policy/openmp/policy.hpp"
+#include "RAJA/util/RepeatView.hpp"
+#include "RAJA/util/reduce.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -56,7 +53,7 @@ namespace detail
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template <typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataOMP;
 
 /*!
@@ -68,38 +65,43 @@ struct MultiReduceDataOMP;
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp >
-struct MultiReduceDataOMP<T, t_MultiReduceOp,
-    RAJA::omp::MultiReduceTuning<RAJA::omp::multi_reduce_algorithm::combine_on_destruction>>
-{
+template <typename T, typename t_MultiReduceOp>
+struct MultiReduceDataOMP<
+    T,
+    t_MultiReduceOp,
+    RAJA::omp::MultiReduceTuning<
+        RAJA::omp::multi_reduce_algorithm::combine_on_destruction>> {
   using value_type = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataOMP() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr >
+  template <typename Container,
+            std::enable_if_t<
+                !std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr>
   MultiReduceDataOMP(Container const& container, T identity)
-      : m_parent(nullptr)
-      , m_num_bins(container.size())
-      , m_identity(identity)
-      , m_data(nullptr)
+      : m_parent(nullptr),
+        m_num_bins(container.size()),
+        m_identity(identity),
+        m_data(nullptr)
   {
     m_data = create_data(container, m_num_bins);
   }
 
-  MultiReduceDataOMP(MultiReduceDataOMP const &other)
-      : m_parent(other.m_parent ? other.m_parent : &other)
-      , m_num_bins(other.m_num_bins)
-      , m_identity(other.m_identity)
-      , m_data(nullptr)
+  MultiReduceDataOMP(MultiReduceDataOMP const& other)
+      : m_parent(other.m_parent ? other.m_parent : &other),
+        m_num_bins(other.m_num_bins),
+        m_identity(other.m_identity),
+        m_data(nullptr)
   {
-    m_data = create_data(RepeatView<value_type>(other.m_identity, other.m_num_bins), other.m_num_bins);
+    m_data =
+        create_data(RepeatView<value_type>(other.m_identity, other.m_num_bins),
+                    other.m_num_bins);
   }
 
-  MultiReduceDataOMP(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP(MultiReduceDataOMP&&) = delete;
   MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete;
-  MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP& operator=(MultiReduceDataOMP&&) = delete;
 
   ~MultiReduceDataOMP()
   {
@@ -116,7 +118,7 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
     }
   }
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
     m_identity = identity;
@@ -138,26 +140,27 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   T identity() const { return m_identity; }
 
-  void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); }
+  void combine(size_t bin, T const& val) { MultiReduceOp{}(m_data[bin], val); }
 
   T get(size_t bin) const { return m_data[bin]; }
 
 private:
-  MultiReduceDataOMP const *m_parent;
+  MultiReduceDataOMP const* m_parent;
   size_t m_num_bins;
   T m_identity;
   T* m_data;
 
-  template < typename Container >
+  template <typename Container>
   static T* create_data(Container const& container, size_t num_bins)
   {
     if (num_bins == size_t(0)) {
       return nullptr;
     }
-    auto data = RAJA::allocate_aligned_type<T>( RAJA::DATA_ALIGN, num_bins * sizeof(T) );
+    auto data =
+        RAJA::allocate_aligned_type<T>(RAJA::DATA_ALIGN, num_bins * sizeof(T));
     size_t bin = 0;
     for (auto const& value : container) {
-      new(&data[bin]) T(value);
+      new (&data[bin]) T(value);
       ++bin;
     }
     return data;
@@ -169,7 +172,7 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
       return;
     }
     for (size_t bin = num_bins; bin > 0; --bin) {
-      data[bin-1].~T();
+      data[bin - 1].~T();
     }
     RAJA::free_aligned(data);
     data = nullptr;
@@ -185,74 +188,92 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp >
-struct MultiReduceDataOMP<T, t_MultiReduceOp,
-    RAJA::omp::MultiReduceTuning<RAJA::omp::multi_reduce_algorithm::combine_on_get>>
-{
+template <typename T, typename t_MultiReduceOp>
+struct MultiReduceDataOMP<
+    T,
+    t_MultiReduceOp,
+    RAJA::omp::MultiReduceTuning<
+        RAJA::omp::multi_reduce_algorithm::combine_on_get>> {
   using value_type = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataOMP() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr >
+  template <typename Container,
+            std::enable_if_t<
+                !std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr>
   MultiReduceDataOMP(Container const& container, T identity)
-      : m_parent(nullptr)
-      , m_max_threads(omp_get_max_threads())
-      , m_num_bins(container.size())
-      , m_padded_threads(pad_threads(m_max_threads))
-      , m_padded_bins(pad_bins(m_num_bins))
-      , m_identity(identity)
-      , m_data(nullptr)
+      : m_parent(nullptr),
+        m_max_threads(omp_get_max_threads()),
+        m_num_bins(container.size()),
+        m_padded_threads(pad_threads(m_max_threads)),
+        m_padded_bins(pad_bins(m_num_bins)),
+        m_identity(identity),
+        m_data(nullptr)
   {
-    m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
+    m_data = create_data(container,
+                         identity,
+                         m_num_bins,
+                         m_max_threads,
+                         m_padded_bins,
+                         m_padded_threads);
   }
 
-  MultiReduceDataOMP(MultiReduceDataOMP const &other)
-      : m_parent(other.m_parent ? other.m_parent : &other)
-      , m_num_bins(other.m_num_bins)
-      , m_padded_threads(other.m_padded_threads)
-      , m_padded_bins(other.m_padded_bins)
-      , m_identity(other.m_identity)
-      , m_data(other.m_data)
-  { }
+  MultiReduceDataOMP(MultiReduceDataOMP const& other)
+      : m_parent(other.m_parent ? other.m_parent : &other),
+        m_num_bins(other.m_num_bins),
+        m_padded_threads(other.m_padded_threads),
+        m_padded_bins(other.m_padded_bins),
+        m_identity(other.m_identity),
+        m_data(other.m_data)
+  {
+  }
 
-  MultiReduceDataOMP(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP(MultiReduceDataOMP&&) = delete;
   MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete;
-  MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP& operator=(MultiReduceDataOMP&&) = delete;
 
   ~MultiReduceDataOMP()
   {
     if (m_data) {
       if (!m_parent) {
-        destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
+        destroy_data(
+            m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
       }
     }
   }
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
     m_identity = identity;
     size_t new_num_bins = container.size();
     if (new_num_bins != m_num_bins) {
-      destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
+      destroy_data(
+          m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
       m_num_bins = new_num_bins;
       m_padded_bins = pad_bins(m_num_bins);
-      m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
+      m_data = create_data(container,
+                           identity,
+                           m_num_bins,
+                           m_max_threads,
+                           m_padded_bins,
+                           m_padded_threads);
     } else {
       if (m_max_threads > 0) {
         {
           size_t thread_idx = 0;
           size_t bin = 0;
           for (auto const& value : container) {
-            m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = value;
+            m_data[index_data(
+                bin, thread_idx, m_padded_bins, m_padded_threads)] = value;
             ++bin;
           }
         }
         for (size_t thread_idx = 1; thread_idx < m_max_threads; ++thread_idx) {
           for (size_t bin = 0; bin < m_num_bins; ++bin) {
-            m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = identity;
+            m_data[index_data(
+                bin, thread_idx, m_padded_bins, m_padded_threads)] = identity;
           }
         }
       }
@@ -263,10 +284,12 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   T identity() const { return m_identity; }
 
-  void combine(size_t bin, T const &val)
+  void combine(size_t bin, T const& val)
   {
     size_t thread_idx = omp_get_thread_num();
-    MultiReduceOp{}(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)], val);
+    MultiReduceOp{}(
+        m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)],
+        val);
   }
 
   T get(size_t bin) const
@@ -274,13 +297,14 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
     ::RAJA::detail::HighAccuracyReduce<T, typename MultiReduceOp::operator_type>
         reducer(m_identity);
     for (size_t thread_idx = 0; thread_idx < m_max_threads; ++thread_idx) {
-      reducer.combine(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)]);
+      reducer.combine(
+          m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)]);
     }
     return reducer.get_and_clear();
   }
 
 private:
-  MultiReduceDataOMP const *m_parent;
+  MultiReduceDataOMP const* m_parent;
   size_t m_max_threads;
   size_t m_num_bins;
   size_t m_padded_threads;
@@ -290,8 +314,10 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   static constexpr size_t pad_bins(size_t num_bins)
   {
-    size_t num_cache_lines = RAJA_DIVIDE_CEILING_INT(num_bins*sizeof(T), RAJA::DATA_ALIGN);
-    return RAJA_DIVIDE_CEILING_INT(num_cache_lines * RAJA::DATA_ALIGN, sizeof(T));
+    size_t num_cache_lines =
+        RAJA_DIVIDE_CEILING_INT(num_bins * sizeof(T), RAJA::DATA_ALIGN);
+    return RAJA_DIVIDE_CEILING_INT(num_cache_lines * RAJA::DATA_ALIGN,
+                                   sizeof(T));
   }
 
   static constexpr size_t pad_threads(size_t max_threads)
@@ -299,33 +325,42 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
     return max_threads;
   }
 
-  static constexpr size_t index_data(size_t bin, size_t thread_idx,
-                                     size_t padded_bins, size_t RAJA_UNUSED_ARG(padded_threads))
+  static constexpr size_t index_data(size_t bin,
+                                     size_t thread_idx,
+                                     size_t padded_bins,
+                                     size_t RAJA_UNUSED_ARG(padded_threads))
   {
     return bin + thread_idx * padded_bins;
   }
 
-  template < typename Container >
-  static T* create_data(Container const& container, T identity,
-                        size_t num_bins, size_t max_threads,
-                        size_t padded_bins, size_t padded_threads)
+  template <typename Container>
+  static T* create_data(Container const& container,
+                        T identity,
+                        size_t num_bins,
+                        size_t max_threads,
+                        size_t padded_bins,
+                        size_t padded_threads)
   {
     if (num_bins == size_t(0)) {
       return nullptr;
     }
-    auto data = RAJA::allocate_aligned_type<T>( RAJA::DATA_ALIGN, padded_threads*padded_bins*sizeof(T) );
+    auto data = RAJA::allocate_aligned_type<T>(RAJA::DATA_ALIGN,
+                                               padded_threads * padded_bins *
+                                                   sizeof(T));
     if (max_threads > 0) {
       {
         size_t thread_idx = 0;
         size_t bin = 0;
         for (auto const& value : container) {
-          new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(value);
+          new (&data[index_data(bin, thread_idx, padded_bins, padded_threads)])
+              T(value);
           ++bin;
         }
       }
       for (size_t thread_idx = 1; thread_idx < max_threads; ++thread_idx) {
         for (size_t bin = 0; bin < num_bins; ++bin) {
-          new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(identity);
+          new (&data[index_data(bin, thread_idx, padded_bins, padded_threads)])
+              T(identity);
         }
       }
     }
@@ -333,15 +368,18 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
   }
 
   static void destroy_data(T*& data,
-                           size_t num_bins, size_t max_threads,
-                           size_t padded_bins, size_t padded_threads)
+                           size_t num_bins,
+                           size_t max_threads,
+                           size_t padded_bins,
+                           size_t padded_threads)
   {
     if (num_bins == size_t(0)) {
       return;
     }
     for (size_t thread_idx = max_threads; thread_idx > 0; --thread_idx) {
       for (size_t bin = num_bins; bin > 0; --bin) {
-        data[index_data(bin-1, thread_idx-1, padded_bins, padded_threads)].~T();
+        data[index_data(bin - 1, thread_idx - 1, padded_bins, padded_threads)]
+            .~T();
       }
     }
     RAJA::free_aligned(data);
@@ -351,7 +389,8 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
 }  // namespace detail
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::omp::omp_multi_reduce_policy, detail::MultiReduceDataOMP)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::omp::omp_multi_reduce_policy,
+                                detail::MultiReduceDataOMP)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/openmp/params/forall.hpp b/include/RAJA/policy/openmp/params/forall.hpp
index d9bea5d0d8..22d4b16902 100644
--- a/include/RAJA/policy/openmp/params/forall.hpp
+++ b/include/RAJA/policy/openmp/params/forall.hpp
@@ -18,296 +18,339 @@ namespace omp
 namespace expt
 {
 
-  namespace internal
-  {
-    //
-    // omp for (Auto)
-    //
-    template <typename ExecPol, typename Iterable, typename Func, typename ForallParam>
-    RAJA_INLINE 
-    concepts::enable_if< std::is_same<ExecPol, RAJA::policy::omp::Auto> >
-    forall_impl(const ExecPol& p,
-                Iterable&& iter,
-                Func&& loop_body,
-                ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+namespace internal
+{
+//
+// omp for (Auto)
+//
+template <typename ExecPol,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if<std::is_same<ExecPol, RAJA::policy::omp::Auto>>
+forall_impl(const ExecPol& p,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(static)
-    //
-    template <template<int> class ExecPol, typename Iterable, typename Func, int ChunkSize, typename ForallParam>
-    RAJA_INLINE 
-    concepts::enable_if< std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
-                         std::integral_constant<bool,(ChunkSize <= 0)> >
-    forall_impl(const ExecPol<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(static) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for schedule(static, ChunkSize)
-    //
-    template <template<int> class ExecPol, typename Iterable, typename Func, int ChunkSize, typename ForallParam>
-    RAJA_INLINE 
-    concepts::enable_if< std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
-                         std::integral_constant<bool,(ChunkSize > 0)> >
-    forall_impl(const ExecPol<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(static, ChunkSize) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+//
+// omp for schedule(static)
+//
+template <template <int> class ExecPol,
+          typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if<
+    std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
+    std::integral_constant<bool, (ChunkSize <= 0)>>
+forall_impl(const ExecPol<ChunkSize>& p,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(static) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(runtime)
-    //
-    template <typename Iterable, typename Func, typename ForallParam>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(runtime) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for nowait (Auto)
-    //
-    template <typename Iterable, typename Func, typename ForallParam>
-    RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-#pragma omp parallel
-      {
-      #pragma omp for nowait reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+//
+// omp for schedule(static, ChunkSize)
+//
+template <template <int> class ExecPol,
+          typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if<
+    std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
+    std::integral_constant<bool, (ChunkSize > 0)>>
+forall_impl(const ExecPol<ChunkSize>& p,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(static, ChunkSize) reduction(combine \
+                                                               : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(dynamic)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(dynamic) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for schedule(dynamic, ChunkSize)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(dynamic, ChunkSize) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+//
+// omp for schedule(runtime)
+//
+template <typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(runtime) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(guided)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(guided) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for schedule(guided, ChunkSize)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(guided, ChunkSize) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+//
+// omp for nowait (Auto)
+//
+template <typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto& p,
+                                    Iterable&& iter,
+                                    Func&& loop_body,
+                                    ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-    //
-    // omp for schedule(static) nowait
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
+  RAJA_EXTRACT_BED_IT(iter);
 #pragma omp parallel
-      {
-      #pragma omp for schedule(static) nowait reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+  {
+#pragma omp for nowait reduction(combine : f_params)
+    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+      RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
     }
+  }
+
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
+
+//
+// omp for schedule(dynamic)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(dynamic) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
+
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
+
+//
+// omp for schedule(dynamic, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(dynamic, ChunkSize) reduction(combine \
+                                                                : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
+
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
+
+//
+// omp for schedule(guided)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(guided) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
+
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
+
+//
+// omp for schedule(guided, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(guided, ChunkSize) reduction(combine \
+                                                               : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
+
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
+
+//
+// omp for schedule(static) nowait
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl_nowait(
+    const ::RAJA::policy::omp::Static<ChunkSize>& p,
+    Iterable&& iter,
+    Func&& loop_body,
+    ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-    //
-    // omp for schedule(static, ChunkSize) nowait
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
+  RAJA_EXTRACT_BED_IT(iter);
 #pragma omp parallel
-      {
-      #pragma omp for schedule(static, ChunkSize) nowait reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+  {
+#pragma omp for schedule(static) nowait reduction(combine : f_params)
+    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+      RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
     }
+  }
 
-  } //  namespace internal
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-  template <typename Schedule, typename Iterable, typename Func, typename ForallParam>
-  RAJA_INLINE resources::EventProxy<resources::Host> forall_impl(resources::Host host_res,
-                                                                 const omp_for_schedule_exec<Schedule>&,
-                                                                 Iterable&& iter,
-                                                                 Func&& loop_body,
-                                                                 ForallParam f_params)
+//
+// omp for schedule(static, ChunkSize) nowait
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl_nowait(
+    const ::RAJA::policy::omp::Static<ChunkSize>& p,
+    Iterable&& iter,
+    Func&& loop_body,
+    ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel
   {
-    expt::internal::forall_impl(Schedule{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body), std::forward<ForallParam>(f_params));
-    return resources::EventProxy<resources::Host>(host_res);
+#pragma omp for schedule(static, ChunkSize) nowait reduction(combine : f_params)
+    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+      RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+    }
   }
-} //  namespace expt
+
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
+
+}  //  namespace internal
+
+template <typename Schedule,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE resources::EventProxy<resources::Host> forall_impl(
+    resources::Host host_res,
+    const omp_for_schedule_exec<Schedule>&,
+    Iterable&& iter,
+    Func&& loop_body,
+    ForallParam f_params)
+{
+  expt::internal::forall_impl(Schedule{},
+                              std::forward<Iterable>(iter),
+                              std::forward<Func>(loop_body),
+                              std::forward<ForallParam>(f_params));
+  return resources::EventProxy<resources::Host>(host_res);
+}
+}  //  namespace expt
 
 ///
 /// OpenMP parallel policy implementation
 ///
-template <typename Iterable, typename Func, typename InnerPolicy, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+template <typename Iterable,
+          typename Func,
+          typename InnerPolicy,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Host host_res,
             const omp_parallel_exec<InnerPolicy>&,
             Iterable&& iter,
diff --git a/include/RAJA/policy/openmp/params/kernel_name.hpp b/include/RAJA/policy/openmp/params/kernel_name.hpp
index 65a5f7a329..ee496edb24 100644
--- a/include/RAJA/policy/openmp/params/kernel_name.hpp
+++ b/include/RAJA/policy/openmp/params/kernel_name.hpp
@@ -3,38 +3,44 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  combine(KernelName&, T& /*place holder argument*/) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL> > init(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template <typename EXEC_POL, typename T>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL> > combine(
+    KernelName&,
+    T& /*place holder argument*/)
+{
+}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL> > resolve(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp/params/reduce.hpp b/include/RAJA/policy/openmp/params/reduce.hpp
index f71efc255a..38c717b289 100644
--- a/include/RAJA/policy/openmp/params/reduce.hpp
+++ b/include/RAJA/policy/openmp/params/reduce.hpp
@@ -3,37 +3,44 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL> > init(
+    Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL> > combine(
+    Reducer<OP, T, VOp>& out,
+    const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL> > resolve(
+    Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_OMP_REDUCE_HPP
+#endif  //  NEW_REDUCE_OMP_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp/policy.hpp b/include/RAJA/policy/openmp/policy.hpp
index aff2567474..40cb894e57 100644
--- a/include/RAJA/policy/openmp/policy.hpp
+++ b/include/RAJA/policy/openmp/policy.hpp
@@ -26,15 +26,15 @@
 #include "RAJA/policy/atomic_builtin.hpp"
 
 #if defined(RAJA_COMPILER_MSVC)
-typedef enum omp_sched_t { 
-    // schedule kinds 
-    omp_sched_static = 0x1, 
-    omp_sched_dynamic = 0x2, 
-    omp_sched_guided = 0x3, 
-    omp_sched_auto = 0x4, 
-    
-    // schedule modifier 
-    omp_sched_monotonic = 0x80000000u 
+typedef enum omp_sched_t {
+  // schedule kinds
+  omp_sched_static = 0x1,
+  omp_sched_dynamic = 0x2,
+  omp_sched_guided = 0x3,
+  omp_sched_auto = 0x4,
+
+  // schedule modifier
+  omp_sched_monotonic = 0x80000000u
 } omp_sched_t;
 #else
 #include <omp.h>
@@ -45,21 +45,19 @@ namespace RAJA
 namespace omp
 {
 
-enum struct multi_reduce_algorithm : int
-{
+enum struct multi_reduce_algorithm : int {
   combine_on_destruction,
   combine_on_get
 };
 
-template < multi_reduce_algorithm t_algorithm >
-struct MultiReduceTuning
-{
+template <multi_reduce_algorithm t_algorithm>
+struct MultiReduceTuning {
   static constexpr multi_reduce_algorithm algorithm = t_algorithm;
   static constexpr bool consistent =
       (algorithm == multi_reduce_algorithm::combine_on_get);
 };
 
-} // namspace omp
+}  // namespace omp
 
 namespace policy
 {
@@ -68,14 +66,15 @@ namespace omp
 
 namespace internal
 {
-    struct ScheduleTag {};
-
-    template <omp_sched_t Sched, int Chunk>
-    struct Schedule : public ScheduleTag {
-        constexpr static omp_sched_t schedule = Sched;
-        constexpr static int chunk_size = Chunk;
-        constexpr static Policy policy = Policy::openmp;
-    };
+struct ScheduleTag {
+};
+
+template <omp_sched_t Sched, int Chunk>
+struct Schedule : public ScheduleTag {
+  constexpr static omp_sched_t schedule = Sched;
+  constexpr static int chunk_size = Chunk;
+  constexpr static Policy policy = Policy::openmp;
+};
 }  // namespace internal
 
 //
@@ -97,7 +96,7 @@ struct NoWait {
 
 static constexpr int default_chunk_size = -1;
 
-struct Auto : public internal::Schedule<omp_sched_auto, default_chunk_size>{
+struct Auto : public internal::Schedule<omp_sched_auto, default_chunk_size> {
 };
 
 template <int ChunkSize = default_chunk_size>
@@ -110,7 +109,8 @@ using Dynamic = internal::Schedule<omp_sched_dynamic, ChunkSize>;
 template <int ChunkSize = default_chunk_size>
 using Guided = internal::Schedule<omp_sched_guided, ChunkSize>;
 
-struct Runtime : private internal::Schedule<static_cast<omp_sched_t>(-1), default_chunk_size> {
+struct Runtime : private internal::Schedule<static_cast<omp_sched_t>(-1),
+                                            default_chunk_size> {
 };
 
 //
@@ -122,7 +122,7 @@ struct Runtime : private internal::Schedule<static_cast<omp_sched_t>(-1), defaul
 //
 
 ///
-///  Struct supporting OpenMP parallel region. 
+///  Struct supporting OpenMP parallel region.
 ///
 struct omp_parallel_region
     : make_policy_pattern_launch_platform_t<Policy::openmp,
@@ -134,11 +134,10 @@ struct omp_parallel_region
 ///
 ///  Struct supporting OpenMP parallel region for Teams
 ///
-struct omp_launch_t
-    : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                            Pattern::region,
-                                            Launch::undefined,
-                                            Platform::host> {
+struct omp_launch_t : make_policy_pattern_launch_platform_t<Policy::openmp,
+                                                            Pattern::region,
+                                                            Launch::undefined,
+                                                            Platform::host> {
 };
 
 
@@ -146,15 +145,17 @@ struct omp_launch_t
 ///  Struct supporting OpenMP 'for nowait schedule( )'
 ///
 template <typename Sched>
-struct omp_for_nowait_schedule_exec : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                                              Pattern::forall,
-                                                              Launch::undefined,
-                                                              Platform::host,
-                                                              omp::For,
-                                                              omp::NoWait,
-                                                              Sched> {
-    static_assert(std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
-        "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
+struct omp_for_nowait_schedule_exec
+    : make_policy_pattern_launch_platform_t<Policy::openmp,
+                                            Pattern::forall,
+                                            Launch::undefined,
+                                            Platform::host,
+                                            omp::For,
+                                            omp::NoWait,
+                                            Sched> {
+  static_assert(
+      std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
+      "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
 };
 
 
@@ -162,14 +163,16 @@ struct omp_for_nowait_schedule_exec : make_policy_pattern_launch_platform_t<Poli
 ///  Struct supporting OpenMP 'for schedule( )'
 ///
 template <typename Sched>
-struct omp_for_schedule_exec : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                                              Pattern::forall,
-                                                              Launch::undefined,
-                                                              Platform::host,
-                                                              omp::For,
-                                                              Sched> {
-    static_assert(std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
-        "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
+struct omp_for_schedule_exec
+    : make_policy_pattern_launch_platform_t<Policy::openmp,
+                                            Pattern::forall,
+                                            Launch::undefined,
+                                            Platform::host,
+                                            omp::For,
+                                            Sched> {
+  static_assert(
+      std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
+      "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
 };
 
 ///
@@ -196,52 +199,58 @@ using omp_for_runtime_exec = omp_for_schedule_exec<omp::Runtime>;
 
 ///
 ///  Internal type aliases supporting 'omp for schedule( ) nowait' for specific
-///  schedule types. 
+///  schedule types.
 ///
 ///  IMPORTANT: We only provide a nowait policy option for static scheduling
 ///             since that is the only scheduling case that can be used with
-///             nowait and be correct in general. Paraphrasing the OpenMP 
+///             nowait and be correct in general. Paraphrasing the OpenMP
 ///             standard:
-///             
-///             Programs that depend on which thread executes a particular 
+///
+///             Programs that depend on which thread executes a particular
 ///             iteration under any circumstance other than static schedule
 ///             are non-conforming.
 ///
 template <int ChunkSize = default_chunk_size>
-using omp_for_nowait_static_exec = omp_for_nowait_schedule_exec<omp::Static<ChunkSize>>;
+using omp_for_nowait_static_exec =
+    omp_for_nowait_schedule_exec<omp::Static<ChunkSize>>;
 
 ///
 ///  Struct supporting OpenMP 'parallel' region containing an inner loop
 ///  execution construct.
 ///
 template <typename InnerPolicy>
-using omp_parallel_exec = make_policy_pattern_launch_platform_t<Policy::openmp,
-                                            Pattern::forall,
-                                            Launch::undefined,
-                                            Platform::host,
-                                            omp::Parallel,
-                                            wrapper<InnerPolicy>>;
+using omp_parallel_exec =
+    make_policy_pattern_launch_platform_t<Policy::openmp,
+                                          Pattern::forall,
+                                          Launch::undefined,
+                                          Platform::host,
+                                          omp::Parallel,
+                                          wrapper<InnerPolicy>>;
 
 ///
-///  Internal type aliases supporting 'omp parallel for schedule( )' for 
+///  Internal type aliases supporting 'omp parallel for schedule( )' for
 ///  specific schedule types.
 ///
 using omp_parallel_for_exec = omp_parallel_exec<omp_for_exec>;
 
 ///
 template <int ChunkSize = default_chunk_size>
-using omp_parallel_for_static_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Static<ChunkSize>> >;
+using omp_parallel_for_static_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Static<ChunkSize>>>;
 
 ///
 template <int ChunkSize = default_chunk_size>
-using omp_parallel_for_dynamic_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Dynamic<ChunkSize>> >;
+using omp_parallel_for_dynamic_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Dynamic<ChunkSize>>>;
 
 ///
 template <int ChunkSize = default_chunk_size>
-using omp_parallel_for_guided_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Guided<ChunkSize>> >;
+using omp_parallel_for_guided_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Guided<ChunkSize>>>;
 
 ///
-using omp_parallel_for_runtime_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Runtime>>;
+using omp_parallel_for_runtime_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Runtime>>;
 
 
 ///
@@ -303,15 +312,15 @@ struct omp_reduce_ordered
 };
 
 ///
-template < typename tuning >
-struct omp_multi_reduce_policy
-    : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                            Pattern::multi_reduce,
-                                            Launch::undefined,
-                                            Platform::host,
-                                            std::conditional_t<tuning::consistent,
-                                                               reduce::ordered,
-                                                               reduce::unordered>> {
+template <typename tuning>
+struct omp_multi_reduce_policy : make_policy_pattern_launch_platform_t<
+                                     Policy::openmp,
+                                     Pattern::multi_reduce,
+                                     Launch::undefined,
+                                     Platform::host,
+                                     std::conditional_t<tuning::consistent,
+                                                        reduce::ordered,
+                                                        reduce::unordered>> {
 };
 
 ///
@@ -327,14 +336,15 @@ using omp_atomic = builtin_atomic;
 
 #else  // RAJA_COMPILER_MSVC not defined
 
-struct omp_atomic {};
+struct omp_atomic {
+};
 
 #endif
 
 
-template < RAJA::omp::multi_reduce_algorithm algorithm >
-using omp_multi_reduce_tuning = omp_multi_reduce_policy<
-    RAJA::omp::MultiReduceTuning<algorithm> >;
+template <RAJA::omp::multi_reduce_algorithm algorithm>
+using omp_multi_reduce_tuning =
+    omp_multi_reduce_policy<RAJA::omp::MultiReduceTuning<algorithm>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - combine_on_destruction policies combine new values into a single value for
@@ -344,8 +354,8 @@ using omp_multi_reduce_combine_on_destruction = omp_multi_reduce_tuning<
     RAJA::omp::multi_reduce_algorithm::combine_on_destruction>;
 // - combine_on_get policies combine new values into a single value for
 //   each thread then when get is called those values are combined.
-using omp_multi_reduce_combine_on_get = omp_multi_reduce_tuning<
-    RAJA::omp::multi_reduce_algorithm::combine_on_get>;
+using omp_multi_reduce_combine_on_get =
+    omp_multi_reduce_tuning<RAJA::omp::multi_reduce_algorithm::combine_on_get>;
 
 // Policy for RAJA::MultiReduce* objects that gives the
 // same answer every time when used in the same way
@@ -395,18 +405,19 @@ using policy::omp::omp_parallel_for_segit;
 using policy::omp::omp_parallel_segit;
 
 ///
-/// Type alias for omp parallel region containing an inner 'omp for' loop 
+/// Type alias for omp parallel region containing an inner 'omp for' loop
 /// execution policy. Inner policy types follow.
 ///
 using policy::omp::omp_parallel_exec;
 
 ///
-/// Type alias for 'omp for' loop execution within an omp_parallel_exec construct
+/// Type alias for 'omp for' loop execution within an omp_parallel_exec
+/// construct
 ///
 using policy::omp::omp_for_exec;
 
 ///
-/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a 
+/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a
 /// scheduling policy within an omp_parallel_exec construct
 /// Scheduling policies are near the top of this file and include:
 /// RAJA::policy::omp::{Auto, Static, Dynamic, Guided, Runtime}
@@ -421,7 +432,7 @@ using policy::omp::omp_for_schedule_exec;
 using policy::omp::omp_for_nowait_schedule_exec;
 
 ///
-/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a 
+/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a
 /// static scheduling policy within an omp_parallel_exec construct
 ///
 using policy::omp::omp_for_static_exec;
@@ -437,8 +448,8 @@ using policy::omp::omp_for_runtime_exec;
 ///
 /// Type aliases for omp parallel region
 ///
-using policy::omp::omp_parallel_region;
 using policy::omp::omp_launch_t;
+using policy::omp::omp_parallel_region;
 
 ///
 /// Type aliases for omp reductions
diff --git a/include/RAJA/policy/openmp/reduce.hpp b/include/RAJA/policy/openmp/reduce.hpp
index 7ccc68c3a1..55c19d6d71 100644
--- a/include/RAJA/policy/openmp/reduce.hpp
+++ b/include/RAJA/policy/openmp/reduce.hpp
@@ -25,17 +25,15 @@
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-#include <memory>
-#include <vector>
-
 #include <omp.h>
 
-#include "RAJA/util/types.hpp"
+#include <memory>
+#include <vector>
 
 #include "RAJA/pattern/detail/reduce.hpp"
 #include "RAJA/pattern/reduce.hpp"
-
 #include "RAJA/policy/openmp/policy.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
diff --git a/include/RAJA/policy/openmp/region.hpp b/include/RAJA/policy/openmp/region.hpp
index 88f0519abf..6cc3e1ab4c 100644
--- a/include/RAJA/policy/openmp/region.hpp
+++ b/include/RAJA/policy/openmp/region.hpp
@@ -39,11 +39,11 @@ RAJA_INLINE void region_impl(const omp_parallel_region &, Func &&body)
 {
 
 #pragma omp parallel
-    { // curly brackets to ensure body() is encapsulated in omp parallel region
-      //thread private copy of body
-      auto loopbody = body;
-      loopbody();
-    }
+  {  // curly brackets to ensure body() is encapsulated in omp parallel region
+    // thread-private copy of body
+    auto loopbody = body;
+    loopbody();
+  }
 }
 
 }  // namespace omp
diff --git a/include/RAJA/policy/openmp/scan.hpp b/include/RAJA/policy/openmp/scan.hpp
index 97cd7a8ab8..5dce537455 100644
--- a/include/RAJA/policy/openmp/scan.hpp
+++ b/include/RAJA/policy/openmp/scan.hpp
@@ -18,7 +18,7 @@
 #ifndef RAJA_scan_openmp_HPP
 #define RAJA_scan_openmp_HPP
 
-#include "RAJA/config.hpp"
+#include <omp.h>
 
 #include <algorithm>
 #include <functional>
@@ -26,11 +26,10 @@
 #include <type_traits>
 #include <vector>
 
-#include <omp.h>
-
+#include "RAJA/config.hpp"
+#include "RAJA/pattern/detail/algorithm.hpp"
 #include "RAJA/policy/openmp/policy.hpp"
 #include "RAJA/policy/sequential/scan.hpp"
-#include "RAJA/pattern/detail/algorithm.hpp"
 
 namespace RAJA
 {
@@ -44,18 +43,16 @@ namespace scan
    initial value
 */
 template <typename Policy, typename Iter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-inclusive_inplace(
-    resources::Host host_res,
-    const Policy&,
-    Iter begin,
-    Iter end,
-    BinFn f)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+inclusive_inplace(resources::Host host_res,
+                  const Policy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f)
 {
-  using std::distance;
   using RAJA::detail::firstIndex;
+  using std::distance;
   using Value = typename ::std::iterator_traits<Iter>::value_type;
   const auto n = distance(begin, end);
   using DistanceT = typename std::remove_const<decltype(n)>::type;
@@ -68,14 +65,18 @@ inclusive_inplace(
     const DistanceT idx_begin = firstIndex(n, p, pid);
     const DistanceT idx_end = firstIndex(n, p, pid + 1);
     if (idx_begin != idx_end) {
-      inclusive_inplace(host_res, ::RAJA::seq_exec{},
-                        begin + idx_begin, begin + idx_end, f);
+      inclusive_inplace(
+          host_res, ::RAJA::seq_exec{}, begin + idx_begin, begin + idx_end, f);
       sums[pid] = begin[idx_end - 1];
     }
 #pragma omp barrier
 #pragma omp single
-    exclusive_inplace(host_res, ::RAJA::seq_exec{},
-                      sums.data(), sums.data() + p, f, BinFn::identity());
+    exclusive_inplace(host_res,
+                      ::RAJA::seq_exec{},
+                      sums.data(),
+                      sums.data() + p,
+                      f,
+                      BinFn::identity());
     for (auto i = idx_begin; i < idx_end; ++i) {
       begin[i] = f(begin[i], sums[pid]);
     }
@@ -89,19 +90,17 @@ inclusive_inplace(
    initial value
 */
 template <typename Policy, typename Iter, typename BinFn, typename ValueT>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-exclusive_inplace(
-    resources::Host host_res,
-    const Policy&,
-    Iter begin,
-    Iter end,
-    BinFn f,
-    ValueT v)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+exclusive_inplace(resources::Host host_res,
+                  const Policy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f,
+                  ValueT v)
 {
-  using std::distance;
   using RAJA::detail::firstIndex;
+  using std::distance;
   using Value = typename ::std::iterator_traits<Iter>::value_type;
   const auto n = distance(begin, end);
   using DistanceT = typename std::remove_const<decltype(n)>::type;
@@ -116,14 +115,18 @@ exclusive_inplace(
     const Value init = ((pid == 0) ? v : *(begin + idx_begin - 1));
 #pragma omp barrier
     if (idx_begin != idx_end) {
-      exclusive_inplace(host_res, seq_exec{},
-                        begin + idx_begin, begin + idx_end, f, init);
+      exclusive_inplace(
+          host_res, seq_exec{}, begin + idx_begin, begin + idx_end, f, init);
       sums[pid] = begin[idx_end - 1];
     }
 #pragma omp barrier
 #pragma omp single
-    exclusive_inplace(host_res, seq_exec{},
-                      sums.data(), sums.data() + p, f, BinFn::identity());
+    exclusive_inplace(host_res,
+                      seq_exec{},
+                      sums.data(),
+                      sums.data() + p,
+                      f,
+                      BinFn::identity());
     for (auto i = idx_begin; i < idx_end; ++i) {
       begin[i] = f(begin[i], sums[pid]);
     }
@@ -137,16 +140,14 @@ exclusive_inplace(
    initial value
 */
 template <typename Policy, typename Iter, typename OutIter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-inclusive(
-    resources::Host host_res,
-    const Policy& exec,
-    Iter begin,
-    Iter end,
-    OutIter out,
-    BinFn f)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+inclusive(resources::Host host_res,
+          const Policy& exec,
+          Iter begin,
+          Iter end,
+          OutIter out,
+          BinFn f)
 {
   using std::distance;
   ::std::copy(begin, end, out);
@@ -162,21 +163,20 @@ template <typename Policy,
           typename OutIter,
           typename BinFn,
           typename ValueT>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-exclusive(
-    resources::Host host_res,
-    const Policy& exec,
-    Iter begin,
-    Iter end,
-    OutIter out,
-    BinFn f,
-    ValueT v)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+exclusive(resources::Host host_res,
+          const Policy& exec,
+          Iter begin,
+          Iter end,
+          OutIter out,
+          BinFn f,
+          ValueT v)
 {
   using std::distance;
   ::std::copy(begin, end, out);
-  return exclusive_inplace(host_res, exec, out, out + distance(begin, end), f, v);
+  return exclusive_inplace(
+      host_res, exec, out, out + distance(begin, end), f, v);
 }
 
 }  // namespace scan
diff --git a/include/RAJA/policy/openmp/sort.hpp b/include/RAJA/policy/openmp/sort.hpp
index 9e4474d692..09126b4585 100644
--- a/include/RAJA/policy/openmp/sort.hpp
+++ b/include/RAJA/policy/openmp/sort.hpp
@@ -18,21 +18,18 @@
 #ifndef RAJA_sort_openmp_HPP
 #define RAJA_sort_openmp_HPP
 
-#include "RAJA/config.hpp"
+#include <omp.h>
 
 #include <algorithm>
 #include <functional>
 #include <iterator>
 
-#include <omp.h>
-
-#include "RAJA/util/macros.hpp"
-
-#include "RAJA/util/concepts.hpp"
-
+#include "RAJA/config.hpp"
+#include "RAJA/pattern/detail/algorithm.hpp"
 #include "RAJA/policy/openmp/policy.hpp"
 #include "RAJA/policy/sequential/sort.hpp"
-#include "RAJA/pattern/detail/algorithm.hpp"
+#include "RAJA/util/concepts.hpp"
+#include "RAJA/util/macros.hpp"
 
 namespace RAJA
 {
@@ -67,11 +64,11 @@ inline void sort_task(Sorter sorter,
 
   if (n <= iterates_per_task) {
 
-    sorter(begin+i_begin, begin+i_end, comp);
+    sorter(begin + i_begin, begin + i_end, comp);
 
   } else {
 
-    const diff_type i_middle = i_begin + n/2;
+    const diff_type i_middle = i_begin + n / 2;
 
 #pragma omp task
     sort_task(sorter, begin, i_begin, i_middle, iterates_per_task, comp);
@@ -81,8 +78,12 @@ inline void sort_task(Sorter sorter,
 
 #pragma omp taskwait
 
-    //std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
-    RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
+    // std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end,
+    // comp);
+    RAJA::detail::inplace_merge(begin + i_begin,
+                                begin + i_middle,
+                                begin + i_end,
+                                comp);
   }
 }
 
@@ -114,20 +115,31 @@ inline void sort_parallel_region(Sorter sorter,
   }
 
   // hierarchically merge ranges
-  for (diff_type middle_offset = 1; middle_offset < num_threads; middle_offset *= 2) {
+  for (diff_type middle_offset = 1; middle_offset < num_threads;
+       middle_offset *= 2) {
 
-    diff_type end_offset = 2*middle_offset;
+    diff_type end_offset = 2 * middle_offset;
 
-    const diff_type i_middle = firstIndex(n, num_threads, std::min(thread_id + middle_offset, num_threads));
-    const diff_type i_end    = firstIndex(n, num_threads, std::min(thread_id + end_offset,    num_threads));
+    const diff_type i_middle =
+        firstIndex(n,
+                   num_threads,
+                   std::min(thread_id + middle_offset, num_threads));
+    const diff_type i_end =
+        firstIndex(n,
+                   num_threads,
+                   std::min(thread_id + end_offset, num_threads));
 
 #pragma omp barrier
 
     if (thread_id % end_offset == 0) {
 
       // this thread merges ranges [i_begin, i_middle) and [i_middle, i_end)
-      //std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
-      RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
+      // std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end,
+      // comp);
+      RAJA::detail::inplace_merge(begin + i_begin,
+                                  begin + i_middle,
+                                  begin + i_end,
+                                  comp);
     }
   }
 }
@@ -139,11 +151,7 @@ inline void sort_parallel_region(Sorter sorter,
         \brief sort given range using sorter and comparison function
 */
 template <typename Sorter, typename Iter, typename Compare>
-inline
-void sort(Sorter sorter,
-          Iter begin,
-          Iter end,
-          Compare comp)
+inline void sort(Sorter sorter, Iter begin, Iter end, Compare comp)
 {
   using diff_type = RAJA::detail::IterDiff<Iter>;
 
@@ -161,12 +169,14 @@ void sort(Sorter sorter,
 
 #if defined(RAJA_ENABLE_OPENMP_TASK_INTERNAL)
 
-    const diff_type iterates_per_task = std::max(n/(2*max_threads), min_iterates_per_task);
+    const diff_type iterates_per_task =
+        std::max(n / (2 * max_threads), min_iterates_per_task);
 
-    const diff_type requested_num_threads = std::min((n+iterates_per_task-1)/iterates_per_task, max_threads);
-    RAJA_UNUSED_VAR(requested_num_threads); // avoid warning in hip device code
+    const diff_type requested_num_threads =
+        std::min((n + iterates_per_task - 1) / iterates_per_task, max_threads);
+    RAJA_UNUSED_VAR(requested_num_threads);  // avoid warning in hip device code
 
-#pragma omp parallel num_threads(static_cast<int>(requested_num_threads))
+#pragma omp parallel num_threads(static_cast <int>(requested_num_threads))
 #pragma omp master
     {
       sort_task(sorter, begin, 0, n, iterates_per_task, comp);
@@ -174,10 +184,12 @@ void sort(Sorter sorter,
 
 #else
 
-    const diff_type requested_num_threads = std::min((n+min_iterates_per_task-1)/min_iterates_per_task, max_threads);
-    RAJA_UNUSED_VAR(requested_num_threads); // avoid warning in hip device code
+    const diff_type requested_num_threads =
+        std::min((n + min_iterates_per_task - 1) / min_iterates_per_task,
+                 max_threads);
+    RAJA_UNUSED_VAR(requested_num_threads);  // avoid warning in hip device code
 
-#pragma omp parallel num_threads(static_cast<int>(requested_num_threads))
+#pragma omp parallel num_threads(static_cast <int>(requested_num_threads))
     {
       sort_parallel_region(sorter, begin, n, comp);
     }
@@ -186,9 +198,9 @@ void sort(Sorter sorter,
   }
 }
 
-} // namespace openmp
+}  // namespace openmp
 
-} // namespace detail
+}  // namespace detail
 
 /*!
         \brief sort given range using comparison function
@@ -196,12 +208,11 @@ void sort(Sorter sorter,
 template <typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-unstable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+unstable(resources::Host host_res,
+         const ExecPolicy&,
+         Iter begin,
+         Iter end,
+         Compare comp)
 {
   detail::openmp::sort(detail::UnstableSorter{}, begin, end, comp);
 
@@ -214,12 +225,11 @@ unstable(
 template <typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-stable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+stable(resources::Host host_res,
+       const ExecPolicy&,
+       Iter begin,
+       Iter end,
+       Compare comp)
 {
   detail::openmp::sort(detail::StableSorter{}, begin, end, comp);
 
@@ -229,43 +239,54 @@ stable(
 /*!
         \brief sort given range of pairs using comparison function on keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template <typename ExecPolicy,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-unstable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+unstable_pairs(resources::Host host_res,
+               const ExecPolicy&,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               Compare comp)
 {
-  auto begin  = RAJA::zip(keys_begin, vals_begin);
-  auto end    = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin = RAJA::zip(keys_begin, vals_begin);
+  auto end = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::openmp::sort(detail::UnstableSorter{}, begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::openmp::sort(detail::UnstableSorter{},
+                       begin,
+                       end,
+                       RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
 
 /*!
-        \brief stable sort given range of pairs using comparison function on keys
+        \brief stable sort given range of pairs using comparison function on
+   keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template <typename ExecPolicy,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-stable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+stable_pairs(resources::Host host_res,
+             const ExecPolicy&,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             Compare comp)
 {
-  auto begin  = RAJA::zip(keys_begin, vals_begin);
-  auto end    = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin = RAJA::zip(keys_begin, vals_begin);
+  auto end = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::openmp::sort(detail::StableSorter{}, begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::openmp::sort(detail::StableSorter{},
+                       begin,
+                       end,
+                       RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
diff --git a/include/RAJA/policy/openmp_target.hpp b/include/RAJA/policy/openmp_target.hpp
index af88127636..cccd77d1fc 100644
--- a/include/RAJA/policy/openmp_target.hpp
+++ b/include/RAJA/policy/openmp_target.hpp
@@ -26,14 +26,15 @@
 
 #include <omp.h>
 
-#include "RAJA/policy/openmp_target/policy.hpp"
-#include "RAJA/policy/openmp_target/kernel.hpp"
 #include "RAJA/policy/openmp_target/forall.hpp"
+#include "RAJA/policy/openmp_target/kernel.hpp"
+#include "RAJA/policy/openmp_target/policy.hpp"
 #include "RAJA/policy/openmp_target/reduce.hpp"
 //#include "RAJA/policy/openmp_target/multi_reduce.hpp"
 #include "RAJA/policy/openmp_target/WorkGroup.hpp"
 
 
-#endif  // closing endif for if defined(RAJA_ENABLE_OPENMP) && defined(RAJA_ENABLE_TARGET_OPENMP)
+#endif  // closing endif for if defined(RAJA_ENABLE_OPENMP) &&
+        // defined(RAJA_ENABLE_TARGET_OPENMP)
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp
index a4a4a62903..59eabd874c 100644
--- a/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp
@@ -19,10 +19,8 @@
 #define RAJA_openmp_target_WorkGroup_Dispatcher_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/policy/openmp_target/policy.hpp"
-
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
+#include "RAJA/policy/openmp_target/policy.hpp"
 
 
 namespace RAJA
@@ -36,12 +34,12 @@ namespace omp_target
 
 // create the value in a target region using the factory, map the value
 // back, and return the value created in the target region
-template < typename Factory >
+template <typename Factory>
 inline auto get_value(Factory factory)
 {
   typename std::decay_t<Factory>::value_type value;
 
-  #pragma omp target map(tofrom : value) map(to : factory)
+#pragma omp target map(tofrom : value) map(to : factory)
   {
     value = factory();
   }
@@ -51,7 +49,7 @@ inline auto get_value(Factory factory)
 
 // get the device value and store it so it can be used
 // multiple times
-template < typename Factory >
+template <typename Factory>
 inline auto get_cached_value(Factory&& factory)
 {
   static auto value = get_value(std::forward<Factory>(factory));
@@ -61,17 +59,17 @@ inline auto get_cached_value(Factory&& factory)
 }  // namespace omp_target
 
 /*!
-* Populate and return a Dispatcher object that can be used in omp target regions
-*/
-template < typename T, typename Dispatcher_T >
+ * Populate and return a Dispatcher object that can be used in omp target
+ * regions
+ */
+template <typename T, typename Dispatcher_T>
 inline const Dispatcher_T* get_Dispatcher(omp_target_work const&)
 {
   static Dispatcher_T dispatcher{
-        Dispatcher_T::template makeDispatcher<T>(
-          [](auto&& factory) {
-            return omp_target::get_cached_value(
-                std::forward<decltype(factory)>(factory));
-          }) };
+      Dispatcher_T::template makeDispatcher<T>([](auto&& factory) {
+        return omp_target::get_cached_value(
+            std::forward<decltype(factory)>(factory));
+      })};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp
index b373d09c61..eb08ad7b15 100644
--- a/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp
@@ -19,10 +19,8 @@
 #define RAJA_openmp_target_WorkGroup_WorkRunner_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/policy/openmp_target/policy.hpp"
-
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
+#include "RAJA/policy/openmp_target/policy.hpp"
 
 
 namespace RAJA
@@ -38,23 +36,21 @@ namespace detail
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_target_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::omp_target_parallel_for_exec_nt,
-        RAJA::omp_target_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::omp_target_work,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallOrdered<RAJA::omp_target_parallel_for_exec_nt,
+                              RAJA::omp_target_work,
+                              RAJA::ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...> {
+};
 
 /*!
  * Runs work in a storage container in reverse order
@@ -63,23 +59,21 @@ struct WorkRunner<
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_target_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::omp_target_parallel_for_exec_nt,
-        RAJA::omp_target_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::omp_target_work,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallReverse<RAJA::omp_target_parallel_for_exec_nt,
+                              RAJA::omp_target_work,
+                              RAJA::reverse_ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...> {
+};
 
 }  // namespace detail
 
diff --git a/include/RAJA/policy/openmp_target/forall.hpp b/include/RAJA/policy/openmp_target/forall.hpp
index 061481cbc1..66f644cf9f 100644
--- a/include/RAJA/policy/openmp_target/forall.hpp
+++ b/include/RAJA/policy/openmp_target/forall.hpp
@@ -14,11 +14,9 @@
 
 #include <omp.h>
 
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/policy/openmp/policy.hpp"
-
 #include "RAJA/pattern/params/forall.hpp"
+#include "RAJA/policy/openmp/policy.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -33,13 +31,15 @@ namespace omp
 /// OpenMP target parallel for policy implementation
 ///
 
-template <size_t ThreadsPerTeam, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+template <size_t ThreadsPerTeam,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec<ThreadsPerTeam>& p,
             Iterable&& iter,
@@ -57,26 +57,27 @@ forall_impl(resources::Omp omp_res,
 
   // Reset if exceed CUDA threads per block limit.
   int tperteam = ThreadsPerTeam;
-  if ( tperteam > omp::MAXNUMTHREADS )
-  {
+  if (tperteam > omp::MAXNUMTHREADS) {
     tperteam = omp::MAXNUMTHREADS;
   }
 
   // calculate number of teams based on user defined threads per team
   // datasize is distance between begin() and end() of iterable
-  auto numteams = RAJA_DIVIDE_CEILING_INT( distance_it, tperteam );
-  if ( numteams > tperteam )
-  {
+  auto numteams = RAJA_DIVIDE_CEILING_INT(distance_it, tperteam);
+  if (numteams > tperteam) {
     // Omp target reducers will write team # results, into Threads-sized array.
     // Need to insure NumTeams <= Threads to prevent array out of bounds access.
     numteams = tperteam;
   }
 
-// thread_limit(tperteam) unused due to XL seg fault (when tperteam != distance)
+  // thread_limit(tperteam) unused due to XL seg fault (when tperteam !=
+  // distance)
   auto i = distance_it;
 
 #pragma omp target teams distribute parallel for num_teams(numteams) \
-    schedule(static, 1) map(to : body,begin_it) reduction(combine: f_params)
+    schedule(static, 1) map(to                                       \
+                            : body, begin_it) reduction(combine      \
+                                                        : f_params)
   for (i = 0; i < distance_it; ++i) {
     Body ib = body;
     RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
@@ -86,13 +87,14 @@ forall_impl(resources::Omp omp_res,
   return resources::EventProxy<resources::Omp>(omp_res);
 }
 
-template <size_t ThreadsPerTeam, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+template <size_t ThreadsPerTeam,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec<ThreadsPerTeam>&,
             Iterable&& iter,
@@ -106,26 +108,26 @@ forall_impl(resources::Omp omp_res,
 
   // Reset if exceed CUDA threads per block limit.
   int tperteam = ThreadsPerTeam;
-  if ( tperteam > omp::MAXNUMTHREADS )
-  {
+  if (tperteam > omp::MAXNUMTHREADS) {
     tperteam = omp::MAXNUMTHREADS;
   }
 
   // calculate number of teams based on user defined threads per team
   // datasize is distance between begin() and end() of iterable
-  auto numteams = RAJA_DIVIDE_CEILING_INT( distance_it, tperteam );
-  if ( numteams > tperteam )
-  {
+  auto numteams = RAJA_DIVIDE_CEILING_INT(distance_it, tperteam);
+  if (numteams > tperteam) {
     // Omp target reducers will write team # results, into Threads-sized array.
     // Need to insure NumTeams <= Threads to prevent array out of bounds access.
     numteams = tperteam;
   }
 
-// thread_limit(tperteam) unused due to XL seg fault (when tperteam != distance)
+  // thread_limit(tperteam) unused due to XL seg fault (when tperteam !=
+  // distance)
   auto i = distance_it;
 
 #pragma omp target teams distribute parallel for num_teams(numteams) \
-    schedule(static, 1) map(to : body,begin_it)
+    schedule(static, 1) map(to                                       \
+                            : body, begin_it)
   for (i = 0; i < distance_it; ++i) {
     Body ib = body;
     ib(begin_it[i]);
@@ -135,16 +137,12 @@ forall_impl(resources::Omp omp_res,
 }
 
 
-
-
-
 template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec_nt& p,
             Iterable&& iter,
@@ -161,7 +159,8 @@ forall_impl(resources::Omp omp_res,
   RAJA_EXTRACT_BED_IT(iter);
 
 #pragma omp target teams distribute parallel for schedule(static, 1) \
-    firstprivate(body,begin_it) reduction(combine: f_params)
+    firstprivate(body, begin_it) reduction(combine                   \
+                                           : f_params)
   for (decltype(distance_it) i = 0; i < distance_it; ++i) {
     Body ib = body;
     RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
@@ -172,12 +171,10 @@ forall_impl(resources::Omp omp_res,
 }
 
 template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec_nt&,
             Iterable&& iter,
@@ -190,7 +187,7 @@ forall_impl(resources::Omp omp_res,
   RAJA_EXTRACT_BED_IT(iter);
 
 #pragma omp target teams distribute parallel for schedule(static, 1) \
-    firstprivate(body,begin_it)
+    firstprivate(body, begin_it)
   for (decltype(distance_it) i = 0; i < distance_it; ++i) {
     Body ib = body;
     ib(begin_it[i]);
diff --git a/include/RAJA/policy/openmp_target/kernel/Collapse.hpp b/include/RAJA/policy/openmp_target/kernel/Collapse.hpp
index b72147151c..efe99e5489 100644
--- a/include/RAJA/policy/openmp_target/kernel/Collapse.hpp
+++ b/include/RAJA/policy/openmp_target/kernel/Collapse.hpp
@@ -10,14 +10,19 @@
 
 #include "RAJA/pattern/kernel/internal.hpp"
 
-namespace RAJA {
-namespace internal {
+namespace RAJA
+{
+namespace internal
+{
 
-template <camp::idx_t Arg0, camp::idx_t Arg1, typename... EnclosedStmts, typename Types>
+template <camp::idx_t Arg0,
+          camp::idx_t Arg1,
+          typename... EnclosedStmts,
+          typename Types>
 struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1>,
-                                             EnclosedStmts...>, Types>
-{
+                                             EnclosedStmts...>,
+                         Types> {
   template <typename Data>
   static RAJA_INLINE void exec(Data&& data)
   {
@@ -32,15 +37,16 @@ struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
     auto privatizer = thread_privatize(data);
 #pragma omp target teams distribute parallel for schedule(static, 1) \
     firstprivate(privatizer) collapse(2)
-      for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
-        for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
-          auto& private_data = privatizer.get_priv();
-          private_data.template assign_offset<Arg0>(i0);
-          private_data.template assign_offset<Arg1>(i1);
-          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(private_data);
-        }
+    for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
+      for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
+        auto& private_data = privatizer.get_priv();
+        private_data.template assign_offset<Arg0>(i0);
+        private_data.template assign_offset<Arg1>(i1);
+        execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(
+            private_data);
       }
     }
+  }
 };
 
 template <camp::idx_t Arg0,
@@ -50,8 +56,8 @@ template <camp::idx_t Arg0,
           typename Types>
 struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1, Arg2>,
-                                             EnclosedStmts...>, Types>
-{
+                                             EnclosedStmts...>,
+                         Types> {
   template <typename Data>
   static RAJA_INLINE void exec(Data&& data)
   {
@@ -68,18 +74,19 @@ struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
     auto privatizer = thread_privatize(data);
 #pragma omp target teams distribute parallel for schedule(static, 1) \
     firstprivate(privatizer) collapse(3)
-      for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
-        for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
-          for (auto i2 = (decltype(l2))0; i2 < l2; ++i2) {
-            auto& private_data = privatizer.get_priv();
-            private_data.template assign_offset<Arg0>(i0);
-            private_data.template assign_offset<Arg1>(i1);
-            private_data.template assign_offset<Arg2>(i2);
-            execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(private_data);
-          }
+    for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
+      for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
+        for (auto i2 = (decltype(l2))0; i2 < l2; ++i2) {
+          auto& private_data = privatizer.get_priv();
+          private_data.template assign_offset<Arg0>(i0);
+          private_data.template assign_offset<Arg1>(i1);
+          private_data.template assign_offset<Arg2>(i2);
+          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(
+              private_data);
         }
       }
     }
+  }
 };
 
 template <camp::idx_t Arg0,
@@ -90,8 +97,8 @@ template <camp::idx_t Arg0,
           typename Types>
 struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1, Arg2, Arg3>,
-                                             EnclosedStmts...>, Types>
-{
+                                             EnclosedStmts...>,
+                         Types> {
   template <typename Data>
   static RAJA_INLINE void exec(Data&& data)
   {
@@ -110,24 +117,25 @@ struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
     auto privatizer = thread_privatize(data);
 #pragma omp target teams distribute parallel for schedule(static, 1) \
     firstprivate(privatizer) collapse(4)
-      for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
-        for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
-          for (auto i2 = (decltype(l2))0; i2 < l2; ++i2) {
-            for (auto i3 = (decltype(l3))0; i3 < l3; ++i3) {
-              auto& private_data = privatizer.get_priv();
-              private_data.template assign_offset<Arg0>(i0);
-              private_data.template assign_offset<Arg1>(i1);
-              private_data.template assign_offset<Arg2>(i2);
-              private_data.template assign_offset<Arg3>(i2);
-              execute_statement_list<camp::list<EnclosedStmts...>, NewTypes3>(private_data);
-            }
+    for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
+      for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
+        for (auto i2 = (decltype(l2))0; i2 < l2; ++i2) {
+          for (auto i3 = (decltype(l3))0; i3 < l3; ++i3) {
+            auto& private_data = privatizer.get_priv();
+            private_data.template assign_offset<Arg0>(i0);
+            private_data.template assign_offset<Arg1>(i1);
+            private_data.template assign_offset<Arg2>(i2);
+            private_data.template assign_offset<Arg3>(i3);
+            execute_statement_list<camp::list<EnclosedStmts...>, NewTypes3>(
+                private_data);
           }
         }
       }
     }
+  }
 };
 
-}
-}
+}  // namespace internal
+}  // namespace RAJA
 
-#endif // RAJA_policy_openmp_target_kernel_Collapse_HPP
+#endif  // RAJA_policy_openmp_target_kernel_Collapse_HPP
diff --git a/include/RAJA/policy/openmp_target/kernel/For.hpp b/include/RAJA/policy/openmp_target/kernel/For.hpp
index 173230b9e2..38b034f27d 100644
--- a/include/RAJA/policy/openmp_target/kernel/For.hpp
+++ b/include/RAJA/policy/openmp_target/kernel/For.hpp
@@ -10,25 +10,31 @@
 
 #include "RAJA/pattern/kernel/internal.hpp"
 
-namespace RAJA {
-namespace internal {
-
-template <camp::idx_t ArgumentId, typename Data, typename Types, typename... EnclosedStmts>
-struct OpenMPTargetForWrapper : public GenericWrapperBase 
+namespace RAJA
+{
+namespace internal
 {
+
+template <camp::idx_t ArgumentId,
+          typename Data,
+          typename Types,
+          typename... EnclosedStmts>
+struct OpenMPTargetForWrapper : public GenericWrapperBase {
   using data_t = camp::decay<Data>;
 
   data_t data;
 
-  /*! 
+  /*!
    * \brief Deferences data so that it can be mapped to the device
    */
   RAJA_INLINE
-  constexpr explicit OpenMPTargetForWrapper(data_t &d) : 
-    data{d}  {}
+  constexpr explicit OpenMPTargetForWrapper(data_t &d) : data{d} {}
 
   RAJA_INLINE
-  void exec() { execute_statement_list<camp::list<EnclosedStmts...>, Types>(data); }
+  void exec()
+  {
+    execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
+  }
 
   template <typename InIndexType>
   RAJA_INLINE void operator()(InIndexType i)
@@ -42,8 +48,10 @@ template <camp::idx_t ArgumentId,
           int N,
           typename... EnclosedStmts,
           typename Types>
-struct StatementExecutor<statement::For<ArgumentId, omp_target_parallel_for_exec<N>, EnclosedStmts...>, Types>
-{
+struct StatementExecutor<statement::For<ArgumentId,
+                                        omp_target_parallel_for_exec<N>,
+                                        EnclosedStmts...>,
+                         Types> {
 
   template <typename Data>
   static RAJA_INLINE void exec(Data &&data)
@@ -51,19 +59,23 @@ struct StatementExecutor<statement::For<ArgumentId, omp_target_parallel_for_exec
     // Set the argument type for this loop
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
-    OpenMPTargetForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...> for_wrapper(data);
+    OpenMPTargetForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...>
+        for_wrapper(data);
 
     auto len = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     auto r = resources::Omp::get_default();
-    forall_impl(r, omp_target_parallel_for_exec<N>{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r,
+                omp_target_parallel_for_exec<N>{},
+                TypedRangeSegment<len_t>(0, len),
+                for_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
 
+}  // namespace internal
+}  // namespace RAJA
 
-}
-}
-
-#endif // RAJA_policy_openmp_kernel_For_HPP
+#endif  // RAJA_policy_openmp_kernel_For_HPP
diff --git a/include/RAJA/policy/openmp_target/params/kernel_name.hpp b/include/RAJA/policy/openmp_target/params/kernel_name.hpp
index 5e9edb4b6c..18609ee69a 100644
--- a/include/RAJA/policy/openmp_target/params/kernel_name.hpp
+++ b/include/RAJA/policy/openmp_target/params/kernel_name.hpp
@@ -3,38 +3,43 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  combine(KernelName&, T& /*place holder argument*/) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL> > init(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template <typename EXEC_POL, typename T>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL> >
+combine(KernelName&, T& /*place holder argument*/)
+{
+}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL> >
+resolve(KernelName&)
+{
+  // TODO: Define kernel naming
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp_target/params/reduce.hpp b/include/RAJA/policy/openmp_target/params/reduce.hpp
index 34c23fb5db..710ddce97a 100644
--- a/include/RAJA/policy/openmp_target/params/reduce.hpp
+++ b/include/RAJA/policy/openmp_target/params/reduce.hpp
@@ -3,37 +3,43 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL> > init(
+    Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL> >
+combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL> >
+resolve(Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_OMP_REDUCE_HPP
+#endif  //  NEW_REDUCE_OMP_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp_target/policy.hpp b/include/RAJA/policy/openmp_target/policy.hpp
index 520f5afc55..cd41b57719 100644
--- a/include/RAJA/policy/openmp_target/policy.hpp
+++ b/include/RAJA/policy/openmp_target/policy.hpp
@@ -10,10 +10,13 @@
 
 #include "RAJA/policy/PolicyBase.hpp"
 
-namespace RAJA {
+namespace RAJA
+{
 
-namespace policy {
-namespace omp {
+namespace policy
+{
+namespace omp
+{
 
 // Max number of CUDA reduction threads per block possible.
 // Required for allocating omp target data before execution policy.
@@ -36,31 +39,33 @@ struct Collapse {
 template <size_t ThreadsPerTeam>
 struct omp_target_parallel_for_exec
     : make_policy_pattern_platform_t<Policy::target_openmp,
-                            Pattern::forall,
-                            Platform::omp_target,
-                            omp::Target,
-                            omp::Teams<ThreadsPerTeam>,
-                            omp::Distribute> {
+                                     Pattern::forall,
+                                     Platform::omp_target,
+                                     omp::Target,
+                                     omp::Teams<ThreadsPerTeam>,
+                                     omp::Distribute> {
 };
 
 struct omp_target_parallel_for_exec_nt
     : make_policy_pattern_platform_t<Policy::target_openmp,
-                            Pattern::forall,
-                            Platform::omp_target,
-                            omp::Target,
-                            omp::Distribute> {
+                                     Pattern::forall,
+                                     Platform::omp_target,
+                                     omp::Target,
+                                     omp::Distribute> {
 };
 
 struct omp_target_parallel_collapse_exec
     : make_policy_pattern_platform_t<Policy::target_openmp,
-                            Pattern::forall,
-                            Platform::omp_target,
-                            omp::Target,
-                            omp::Collapse> {
+                                     Pattern::forall,
+                                     Platform::omp_target,
+                                     omp::Target,
+                                     omp::Collapse> {
 };
 
 struct omp_target_reduce
-    : make_policy_pattern_platform_t<Policy::target_openmp, Pattern::reduce, Platform::omp_target> {
+    : make_policy_pattern_platform_t<Policy::target_openmp,
+                                     Pattern::reduce,
+                                     Platform::omp_target> {
 };
 
 ///
@@ -74,17 +79,17 @@ struct omp_target_work
 };
 
 
-}  // closing brace for omp namespace
-}  // closing brace for policy namespace
+}  // namespace omp
+}  // namespace policy
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
+using policy::omp::omp_target_parallel_collapse_exec;
 using policy::omp::omp_target_parallel_for_exec;
 using policy::omp::omp_target_parallel_for_exec_nt;
 using policy::omp::omp_target_reduce;
-using policy::omp::omp_target_parallel_collapse_exec;
 using policy::omp::omp_target_work;
 #endif
 
-} // closing brace for RAJA namespace
+}  // namespace RAJA
 
-#endif // RAJA_policy_openmp_target_HPP
+#endif  // RAJA_policy_openmp_target_HPP
diff --git a/include/RAJA/policy/openmp_target/reduce.hpp b/include/RAJA/policy/openmp_target/reduce.hpp
index 6691729bbe..82a6406225 100644
--- a/include/RAJA/policy/openmp_target/reduce.hpp
+++ b/include/RAJA/policy/openmp_target/reduce.hpp
@@ -14,15 +14,13 @@
 
 //#include <cassert>  // Leaving out until XL is fixed 2/25/2019.
 
-#include <algorithm>
-
 #include <omp.h>
 
-#include "RAJA/util/types.hpp"
+#include <algorithm>
 
 #include "RAJA/pattern/reduce.hpp"
-
 #include "RAJA/policy/openmp/policy.hpp"
+#include "RAJA/util/types.hpp"
 
 
 namespace RAJA
@@ -33,8 +31,7 @@ namespace omp
 #pragma omp declare target
 
 template <typename T, typename I>
-struct minloc 
-{
+struct minloc {
   static constexpr T identity = T(::RAJA::operators::limits<T>::max());
   RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
                                                I &loc,
@@ -49,8 +46,7 @@ struct minloc
 };
 
 template <typename T, typename I>
-struct maxloc 
-{
+struct maxloc {
   static constexpr T identity = T(::RAJA::operators::limits<T>::min());
   RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
                                                I &loc,
@@ -70,8 +66,7 @@ struct maxloc
 static constexpr int MaxNumTeams = policy::omp::MAXNUMTHREADS;
 
 //! Information necessary for OpenMP offload to be considered
-struct Offload_Info 
-{
+struct Offload_Info {
   int hostID{omp_get_initial_device()};
   int deviceID{omp_get_default_device()};
   bool isMapped{false};
@@ -87,8 +82,7 @@ struct Offload_Info
 //! Reduction data for OpenMP Offload -- stores value, host pointer, and device
 //! pointer
 template <typename T>
-struct Reduce_Data
-{
+struct Reduce_Data {
   mutable T value;
   T *device;
   T *host;
@@ -101,7 +95,7 @@ struct Reduce_Data
    *  allocates data on the host and device and initializes values to default
    */
   Reduce_Data(T initValue, T identityValue, Offload_Info &info)
-     : value(initValue),
+      : value(initValue),
         device{reinterpret_cast<T *>(
             omp_target_alloc(omp::MaxNumTeams * sizeof(T), info.deviceID))},
         host{new T[omp::MaxNumTeams]}
@@ -118,10 +112,7 @@ struct Reduce_Data
     hostToDevice(info);
   }
 
-  void reset(T initValue)
-  {
-    value = initValue;
-  }
+  void reset(T initValue) { value = initValue; }
 
 
   //! default copy constructor for POD
@@ -178,8 +169,7 @@ struct Reduce_Data
 //! OpenMP Target Reduction entity -- generalize on # of teams, reduction, and
 //! type
 template <typename Reducer, typename T>
-struct TargetReduce 
-{
+struct TargetReduce {
   TargetReduce() = delete;
   TargetReduce(const TargetReduce &) = default;
 
@@ -199,13 +189,14 @@ struct TargetReduce
     finalVal = identity_;
   }
 
-#ifdef __ibmxl__ // TODO: implicit declare target doesn't pick this up
+#ifdef __ibmxl__  // TODO: implicit declare target doesn't pick this up
 #pragma omp declare target
 #endif
   //! apply reduction on device upon destruction
   ~TargetReduce()
   {
-    //assert ( omp_get_num_teams() <= omp::MaxNumTeams );  // Leaving out until XL is fixed 2/25/2019.
+    // Leaving the assert out until XL is fixed 2/25/2019:
+    // assert ( omp_get_num_teams() <= omp::MaxNumTeams );
     if (!omp_is_initial_device()) {
 #pragma omp critical
       {
@@ -214,7 +205,7 @@ struct TargetReduce
       }
     }
   }
-#ifdef __ibmxl__ // TODO: implicit declare target doesn't pick this up
+#ifdef __ibmxl__  // TODO: implicit declare target doesn't pick this up
 #pragma omp end declare target
 #endif
 
@@ -264,13 +255,15 @@ struct TargetReduce
 //! OpenMP Target Reduction Location entity -- generalize on # of teams,
 //! reduction, and type
 template <typename Reducer, typename T, typename IndexType>
-struct TargetReduceLoc 
-{
+struct TargetReduceLoc {
   TargetReduceLoc() = delete;
   TargetReduceLoc(const TargetReduceLoc &) = default;
-  explicit TargetReduceLoc(T init_val_, IndexType init_loc,
-                           T identity_val_ = Reducer::identity,
-                           IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+  explicit TargetReduceLoc(
+      T init_val_,
+      IndexType init_loc,
+      T identity_val_ = Reducer::identity,
+      IndexType identity_loc_ =
+          RAJA::reduce::detail::DefaultLoc<IndexType>().value())
       : info(),
         val(identity_val_, identity_val_, info),
         loc(identity_loc_, identity_loc_, info),
@@ -281,9 +274,11 @@ struct TargetReduceLoc
   {
   }
 
-  void reset(T init_val_, IndexType init_loc_,
+  void reset(T init_val_,
+             IndexType init_loc_,
              T identity_val_ = Reducer::identity,
-             IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_loc_ =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
     operator T();
     val.reset(identity_val_);
@@ -297,7 +292,8 @@ struct TargetReduceLoc
   //! apply reduction on device upon destruction
   ~TargetReduceLoc()
   {
-    //assert ( omp_get_num_teams() <= omp::MaxNumTeams );  // Leaving out until XL is fixed 2/25/2019.
+    // Leaving the assert out until XL is fixed 2/25/2019:
+    // assert ( omp_get_num_teams() <= omp::MaxNumTeams );
     if (!omp_is_initial_device()) {
 #pragma omp critical
       {
@@ -372,7 +368,6 @@ class ReduceSum<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::sum<T>, T>
 {
 public:
-
   using self = ReduceSum<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::sum<T>, T>;
   using parent::parent;
@@ -398,7 +393,6 @@ class ReduceBitOr<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::or_bit<T>, T>
 {
 public:
-
   using self = ReduceBitOr<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::or_bit<T>, T>;
   using parent::parent;
@@ -424,7 +418,6 @@ class ReduceBitAnd<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::and_bit<T>, T>
 {
 public:
-
   using self = ReduceBitAnd<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::and_bit<T>, T>;
   using parent::parent;
@@ -450,7 +443,6 @@ class ReduceMin<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::min<T>, T>
 {
 public:
-
   using self = ReduceMin<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::min<T>, T>;
   using parent::parent;
@@ -477,7 +469,6 @@ class ReduceMax<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::max<T>, T>
 {
 public:
-
   using self = ReduceMax<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::max<T>, T>;
   using parent::parent;
@@ -503,10 +494,8 @@ class ReduceMinLoc<omp_target_reduce, T, IndexType>
     : public TargetReduceLoc<omp::minloc<T, IndexType>, T, IndexType>
 {
 public:
-
   using self = ReduceMinLoc<omp_target_reduce, T, IndexType>;
-  using parent =
-      TargetReduceLoc<omp::minloc<T, IndexType>, T, IndexType>;
+  using parent = TargetReduceLoc<omp::minloc<T, IndexType>, T, IndexType>;
   using parent::parent;
 
   //! enable minloc() for ReduceMinLoc -- alias for reduce()
@@ -531,10 +520,8 @@ class ReduceMaxLoc<omp_target_reduce, T, IndexType>
     : public TargetReduceLoc<omp::maxloc<T, IndexType>, T, IndexType>
 {
 public:
-
   using self = ReduceMaxLoc<omp_target_reduce, T, IndexType>;
-  using parent =
-      TargetReduceLoc<omp::maxloc<T, IndexType>, T, IndexType>;
+  using parent = TargetReduceLoc<omp::maxloc<T, IndexType>, T, IndexType>;
   using parent::parent;
 
   //! enable maxloc() for ReduceMaxLoc -- alias for reduce()
diff --git a/include/RAJA/policy/sequential.hpp b/include/RAJA/policy/sequential.hpp
index 0963b31a01..e9d2463dc8 100644
--- a/include/RAJA/policy/sequential.hpp
+++ b/include/RAJA/policy/sequential.hpp
@@ -21,17 +21,17 @@
 #define RAJA_sequential_HPP
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/sequential/atomic.hpp"
+#include "RAJA/policy/sequential/atomic.hpp"
 #endif
 
+#include "RAJA/policy/sequential/WorkGroup.hpp"
 #include "RAJA/policy/sequential/forall.hpp"
 #include "RAJA/policy/sequential/kernel.hpp"
+#include "RAJA/policy/sequential/launch.hpp"
+#include "RAJA/policy/sequential/multi_reduce.hpp"
 #include "RAJA/policy/sequential/policy.hpp"
 #include "RAJA/policy/sequential/reduce.hpp"
-#include "RAJA/policy/sequential/multi_reduce.hpp"
 #include "RAJA/policy/sequential/scan.hpp"
 #include "RAJA/policy/sequential/sort.hpp"
-#include "RAJA/policy/sequential/launch.hpp"
-#include "RAJA/policy/sequential/WorkGroup.hpp"
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp
index 13796fd8a3..691bab53c0 100644
--- a/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp
@@ -19,10 +19,8 @@
 #define RAJA_sequential_WorkGroup_Dispatcher_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/policy/sequential/policy.hpp"
-
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
+#include "RAJA/policy/sequential/policy.hpp"
 
 
 namespace RAJA
@@ -32,12 +30,12 @@ namespace detail
 {
 
 /*!
-* Populate and return a Dispatcher object
-*/
-template < typename T, typename Dispatcher_T >
+ * Populate and return a Dispatcher object
+ */
+template <typename T, typename Dispatcher_T>
 inline const Dispatcher_T* get_Dispatcher(seq_work const&)
 {
-  static Dispatcher_T dispatcher{ Dispatcher_T::template makeDispatcher<T>() };
+  static Dispatcher_T dispatcher{Dispatcher_T::template makeDispatcher<T>()};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp
index 31e401bf88..8a1870ce56 100644
--- a/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp
@@ -19,10 +19,8 @@
 #define RAJA_sequential_WorkGroup_WorkRunner_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/policy/sequential/policy.hpp"
-
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
+#include "RAJA/policy/sequential/policy.hpp"
 
 
 namespace RAJA
@@ -38,23 +36,20 @@ namespace detail
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::seq_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::seq_exec,
-        RAJA::seq_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::seq_work,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...> : WorkRunnerForallOrdered<RAJA::seq_exec,
+                                                     RAJA::seq_work,
+                                                     RAJA::ordered,
+                                                     DISPATCH_POLICY_T,
+                                                     ALLOCATOR_T,
+                                                     INDEX_T,
+                                                     Args...> {
+};
 
 /*!
  * Runs work in a storage container in reverse order
@@ -63,23 +58,20 @@ struct WorkRunner<
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::seq_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::seq_exec,
-        RAJA::seq_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::seq_work,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...> : WorkRunnerForallReverse<RAJA::seq_exec,
+                                                     RAJA::seq_work,
+                                                     RAJA::reverse_ordered,
+                                                     DISPATCH_POLICY_T,
+                                                     ALLOCATOR_T,
+                                                     INDEX_T,
+                                                     Args...> {
+};
 
 }  // namespace detail
 
diff --git a/include/RAJA/policy/sequential/atomic.hpp b/include/RAJA/policy/sequential/atomic.hpp
index 046e52e1c1..62d44c9fb9 100644
--- a/include/RAJA/policy/sequential/atomic.hpp
+++ b/include/RAJA/policy/sequential/atomic.hpp
@@ -19,7 +19,6 @@
 #define RAJA_policy_sequential_atomic_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/util/macros.hpp"
 
 namespace RAJA
@@ -27,24 +26,21 @@ namespace RAJA
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicLoad(seq_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(seq_atomic, T *acc)
 {
   return *acc;
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE void atomicStore(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE void atomicStore(seq_atomic, T *acc, T value)
 {
   *acc = value;
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAdd(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(seq_atomic, T *acc, T value)
 {
   T ret = *acc;
   *acc += value;
@@ -54,8 +50,7 @@ RAJA_INLINE T atomicAdd(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicSub(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(seq_atomic, T *acc, T value)
 {
   T ret = *acc;
   *acc -= value;
@@ -65,8 +60,7 @@ RAJA_INLINE T atomicSub(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMin(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(seq_atomic, T *acc, T value)
 {
   T ret = *acc;
   *acc = ret < value ? ret : value;
@@ -75,8 +69,7 @@ RAJA_INLINE T atomicMin(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMax(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(seq_atomic, T *acc, T value)
 {
   T ret = *acc;
   *acc = value < ret ? ret : value;
@@ -86,8 +79,7 @@ RAJA_INLINE T atomicMax(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(seq_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(seq_atomic, T *acc)
 {
   T ret = *acc;
   (*acc) += T(1);
@@ -96,8 +88,7 @@ RAJA_INLINE T atomicInc(seq_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(seq_atomic, T *acc, T val)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(seq_atomic, T *acc, T val)
 {
   T old = *acc;
   *acc = val <= old ? T(0) : old + T(1);
@@ -106,8 +97,7 @@ RAJA_INLINE T atomicInc(seq_atomic, T *acc, T val)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(seq_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(seq_atomic, T *acc)
 {
   T ret = *acc;
   (*acc) -= T(1);
@@ -116,8 +106,7 @@ RAJA_INLINE T atomicDec(seq_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(seq_atomic, T *acc, T val)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(seq_atomic, T *acc, T val)
 {
   T old = *acc;
   *acc = old == T(0) || val < old ? val : old - T(1);
@@ -126,8 +115,7 @@ RAJA_INLINE T atomicDec(seq_atomic, T *acc, T val)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value)
 {
   T ret = *acc;
   *acc &= value;
@@ -136,8 +124,7 @@ RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value)
 {
   T ret = *acc;
   *acc |= value;
@@ -146,8 +133,7 @@ RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value)
 {
   T ret = *acc;
   *acc ^= value;
@@ -156,8 +142,7 @@ RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(seq_atomic, T *acc, T value)
 {
   T ret = *acc;
   *acc = value;
@@ -166,8 +151,7 @@ RAJA_INLINE T atomicExchange(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(seq_atomic, T *acc, T compare, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(seq_atomic, T *acc, T compare, T value)
 {
   T ret = *acc;
   *acc = ret == compare ? value : ret;
diff --git a/include/RAJA/policy/sequential/forall.hpp b/include/RAJA/policy/sequential/forall.hpp
index 5d1d6d84b0..746ae151e7 100644
--- a/include/RAJA/policy/sequential/forall.hpp
+++ b/include/RAJA/policy/sequential/forall.hpp
@@ -24,18 +24,12 @@
 #define RAJA_forall_sequential_HPP
 
 #include "RAJA/config.hpp"
-
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/policy/sequential/policy.hpp"
-
 #include "RAJA/internal/fault_tolerance.hpp"
-
 #include "RAJA/pattern/detail/forall.hpp"
-
-#include "RAJA/util/resource.hpp"
-
 #include "RAJA/pattern/params/forall.hpp"
+#include "RAJA/policy/sequential/policy.hpp"
+#include "RAJA/util/resource.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -55,13 +49,14 @@ namespace sequential
 //////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename Func, typename Resource, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<Resource>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+template <typename Iterable,
+          typename Func,
+          typename Resource,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Resource>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(Resource res,
             const seq_exec &,
             Iterable &&iter,
@@ -80,13 +75,14 @@ forall_impl(Resource res,
   return resources::EventProxy<Resource>(res);
 }
 
-template <typename Iterable, typename Func, typename Resource, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<Resource>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+template <typename Iterable,
+          typename Func,
+          typename Resource,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Resource>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(Resource res,
             const seq_exec &,
             Iterable &&iter,
diff --git a/include/RAJA/policy/sequential/kernel/Collapse.hpp b/include/RAJA/policy/sequential/kernel/Collapse.hpp
index 8e600ec2e8..2e55cf45f7 100644
--- a/include/RAJA/policy/sequential/kernel/Collapse.hpp
+++ b/include/RAJA/policy/sequential/kernel/Collapse.hpp
@@ -32,7 +32,8 @@ namespace internal
 //
 template <typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::Collapse<seq_exec, ArgList<>, EnclosedStmts...>, Types> {
+    statement::Collapse<seq_exec, ArgList<>, EnclosedStmts...>,
+    Types> {
 
   template <typename Data>
   static RAJA_INLINE void exec(Data &data)
@@ -47,10 +48,13 @@ struct StatementExecutor<
 // Executor that handles collapsing of an arbitrarily deep set of seq_exec
 // loops
 //
-template <camp::idx_t Arg0, camp::idx_t... ArgRest, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::Collapse<seq_exec,
-                                             ArgList<Arg0, ArgRest...>,
-                                             EnclosedStmts...>, Types> {
+template <camp::idx_t Arg0,
+          camp::idx_t... ArgRest,
+          typename... EnclosedStmts,
+          typename Types>
+struct StatementExecutor<
+    statement::Collapse<seq_exec, ArgList<Arg0, ArgRest...>, EnclosedStmts...>,
+    Types> {
 
   template <typename Data>
   static RAJA_INLINE void exec(Data &data)
@@ -61,7 +65,8 @@ struct StatementExecutor<statement::Collapse<seq_exec,
 
     // compute next-most inner loop Executor
     using next_loop_t = StatementExecutor<
-        statement::Collapse<seq_exec, ArgList<ArgRest...>, EnclosedStmts...>, NewTypes>;
+        statement::Collapse<seq_exec, ArgList<ArgRest...>, EnclosedStmts...>,
+        NewTypes>;
 
     auto len0 = segment_length<Arg0>(data);
 
diff --git a/include/RAJA/policy/sequential/kernel/Reduce.hpp b/include/RAJA/policy/sequential/kernel/Reduce.hpp
index 7280844320..1556e5e2a2 100644
--- a/include/RAJA/policy/sequential/kernel/Reduce.hpp
+++ b/include/RAJA/policy/sequential/kernel/Reduce.hpp
@@ -34,7 +34,8 @@ template <template <typename...> class ReduceOperator,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::Reduce<seq_reduce, ReduceOperator, ParamId, EnclosedStmts...>, Types> {
+    statement::Reduce<seq_reduce, ReduceOperator, ParamId, EnclosedStmts...>,
+    Types> {
 
   template <typename Data>
   static RAJA_INLINE void exec(Data &&data)
diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp
index a2025a71d5..70ec391644 100644
--- a/include/RAJA/policy/sequential/launch.hpp
+++ b/include/RAJA/policy/sequential/launch.hpp
@@ -19,8 +19,8 @@
 #define RAJA_pattern_launch_sequential_HPP
 
 #include "RAJA/pattern/launch/launch_core.hpp"
-#include "RAJA/policy/sequential/policy.hpp"
 #include "RAJA/pattern/params/forall.hpp"
+#include "RAJA/policy/sequential/policy.hpp"
 
 namespace RAJA
 {
@@ -28,8 +28,8 @@ namespace RAJA
 template <>
 struct LaunchExecute<RAJA::null_launch_t> {
   template <typename BODY>
-  static void exec(LaunchContext const& RAJA_UNUSED_ARG(ctx),
-                   BODY const& RAJA_UNUSED_ARG(body))
+  static void exec(LaunchContext const &RAJA_UNUSED_ARG(ctx),
+                   BODY const &RAJA_UNUSED_ARG(body))
   {
     RAJA_ABORT_OR_THROW("NULL Launch");
   }
@@ -40,11 +40,15 @@ template <>
 struct LaunchExecute<RAJA::seq_launch_t> {
 
   template <typename BODY, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, LaunchParams const &params, const char *RAJA_UNUSED_ARG(kernel_name),
-       BODY const &body, ReduceParams &RAJA_UNUSED_ARG(ReduceParams))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const &params,
+       const char *RAJA_UNUSED_ARG(kernel_name),
+       BODY const &body,
+       ReduceParams &RAJA_UNUSED_ARG(ReduceParams))
   {
 
     LaunchContext ctx;
@@ -60,12 +64,17 @@ struct LaunchExecute<RAJA::seq_launch_t> {
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  template<typename BODY, typename ReduceParams>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       const char *RAJA_UNUSED_ARG(kernel_name), BODY const &body, ReduceParams &launch_reducers)
+  template <typename BODY, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const &launch_params,
+       const char *RAJA_UNUSED_ARG(kernel_name),
+       BODY const &body,
+       ReduceParams &launch_reducers)
   {
     expt::ParamMultiplexer::init<seq_exec>(launch_reducers);
 
@@ -82,7 +91,6 @@ struct LaunchExecute<RAJA::seq_launch_t> {
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
@@ -91,9 +99,8 @@ struct LoopExecute<seq_exec, SEGMENT> {
 
   RAJA_SUPPRESS_HD_WARN
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void exec(SEGMENT const &segment,
+                                                BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -160,7 +167,6 @@ struct LoopExecute<seq_exec, SEGMENT> {
       }
     }
   }
-
 };
 
 
@@ -179,7 +185,7 @@ struct LoopICountExecute<seq_exec, SEGMENT> {
     }
   }
 
-    template <typename BODY>
+  template <typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       SEGMENT const &segment0,
@@ -218,15 +224,17 @@ struct LoopICountExecute<seq_exec, SEGMENT> {
         for (int i = 0; i < len0; i++) {
           body(*(segment0.begin() + i),
                *(segment1.begin() + j),
-               *(segment2.begin() + k), i, j, k);
+               *(segment2.begin() + k),
+               i,
+               j,
+               k);
         }
       }
     }
   }
-
 };
 
-//Tile Execute + variants
+// Tile Execute + variants
 
 template <typename SEGMENT>
 struct TileExecute<seq_exec, SEGMENT> {
@@ -241,12 +249,10 @@ struct TileExecute<seq_exec, SEGMENT> {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = 0; tx < len; tx += tile_size)
-    {
+    for (int tx = 0; tx < len; tx += tile_size) {
       body(segment.slice(tx, tile_size));
     }
   }
-
 };
 
 template <typename SEGMENT>
@@ -262,12 +268,10 @@ struct TileTCountExecute<seq_exec, SEGMENT> {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = 0, bx=0; tx < len; tx += tile_size, bx++)
-    {
+    for (int tx = 0, bx = 0; tx < len; tx += tile_size, bx++) {
       body(segment.slice(tx, tile_size), bx);
     }
   }
-
 };
 
 }  // namespace RAJA
diff --git a/include/RAJA/policy/sequential/multi_reduce.hpp b/include/RAJA/policy/sequential/multi_reduce.hpp
index be3a3860f8..40de5adf58 100644
--- a/include/RAJA/policy/sequential/multi_reduce.hpp
+++ b/include/RAJA/policy/sequential/multi_reduce.hpp
@@ -22,14 +22,10 @@
 #define RAJA_sequential_multi_reduce_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/internal/MemUtils_CPU.hpp"
-
 #include "RAJA/pattern/detail/multi_reduce.hpp"
 #include "RAJA/pattern/multi_reduce.hpp"
-
 #include "RAJA/policy/sequential/policy.hpp"
-
 #include "RAJA/util/types.hpp"
 
 namespace RAJA
@@ -47,7 +43,7 @@ namespace detail
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template <typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataSeq;
 
 /*!
@@ -59,37 +55,40 @@ struct MultiReduceDataSeq;
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp >
-struct MultiReduceDataSeq<T, t_MultiReduceOp,
+template <typename T, typename t_MultiReduceOp>
+struct MultiReduceDataSeq<
+    T,
+    t_MultiReduceOp,
     RAJA::sequential::MultiReduceTuning<
-      RAJA::sequential::multi_reduce_algorithm::left_fold>>
-{
+        RAJA::sequential::multi_reduce_algorithm::left_fold>> {
   using value_type = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataSeq() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataSeq>::value>* = nullptr >
+  template <typename Container,
+            std::enable_if_t<
+                !std::is_same<Container, MultiReduceDataSeq>::value>* = nullptr>
   MultiReduceDataSeq(Container const& container, T identity)
-      : m_parent(nullptr)
-      , m_num_bins(container.size())
-      , m_identity(identity)
-      , m_data(nullptr)
+      : m_parent(nullptr),
+        m_num_bins(container.size()),
+        m_identity(identity),
+        m_data(nullptr)
   {
     m_data = create_data(container, m_num_bins);
   }
 
-  MultiReduceDataSeq(MultiReduceDataSeq const &other)
-      : m_parent(other.m_parent ? other.m_parent : &other)
-      , m_num_bins(other.m_num_bins)
-      , m_identity(other.m_identity)
-      , m_data(other.m_data)
-  { }
+  MultiReduceDataSeq(MultiReduceDataSeq const& other)
+      : m_parent(other.m_parent ? other.m_parent : &other),
+        m_num_bins(other.m_num_bins),
+        m_identity(other.m_identity),
+        m_data(other.m_data)
+  {
+  }
 
-  MultiReduceDataSeq(MultiReduceDataSeq &&) = delete;
+  MultiReduceDataSeq(MultiReduceDataSeq&&) = delete;
   MultiReduceDataSeq& operator=(MultiReduceDataSeq const&) = delete;
-  MultiReduceDataSeq& operator=(MultiReduceDataSeq &&) = delete;
+  MultiReduceDataSeq& operator=(MultiReduceDataSeq&&) = delete;
 
   ~MultiReduceDataSeq()
   {
@@ -100,7 +99,7 @@ struct MultiReduceDataSeq<T, t_MultiReduceOp,
     }
   }
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
     m_identity = identity;
@@ -122,27 +121,27 @@ struct MultiReduceDataSeq<T, t_MultiReduceOp,
 
   T identity() const { return m_identity; }
 
-  void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); }
+  void combine(size_t bin, T const& val) { MultiReduceOp{}(m_data[bin], val); }
 
   T get(size_t bin) const { return m_data[bin]; }
 
 private:
-  MultiReduceDataSeq const *m_parent;
+  MultiReduceDataSeq const* m_parent;
   size_t m_num_bins;
   T m_identity;
   T* m_data;
 
-  template < typename Container >
+  template <typename Container>
   static T* create_data(Container const& container, size_t num_bins)
   {
     if (num_bins == size_t(0)) {
       return nullptr;
     }
 
-    auto data = static_cast<T*>(malloc(num_bins*sizeof(T)));
+    auto data = static_cast<T*>(malloc(num_bins * sizeof(T)));
     size_t bin = 0;
     for (auto const& value : container) {
-      new(&data[bin]) T(value);
+      new (&data[bin]) T(value);
       ++bin;
     }
     return data;
@@ -164,7 +163,8 @@ struct MultiReduceDataSeq<T, t_MultiReduceOp,
 
 }  // namespace detail
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::sequential::seq_multi_reduce_policy, detail::MultiReduceDataSeq)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::sequential::seq_multi_reduce_policy,
+                                detail::MultiReduceDataSeq)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/sequential/params/kernel_name.hpp b/include/RAJA/policy/sequential/params/kernel_name.hpp
index 00e6a1dc52..f0e90d229b 100644
--- a/include/RAJA/policy/sequential/params/kernel_name.hpp
+++ b/include/RAJA/policy/sequential/params/kernel_name.hpp
@@ -3,35 +3,40 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  combine(KernelName&, T) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
-
-
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec> > init(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template <typename EXEC_POL, typename T>
+RAJA_HOST_DEVICE camp::concepts::enable_if<
+    std::is_same<EXEC_POL, RAJA::seq_exec> >
+combine(KernelName&, T)
+{
+}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec> > resolve(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
+
+
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/sequential/params/reduce.hpp b/include/RAJA/policy/sequential/params/reduce.hpp
index b77028ca5f..59159dd1bb 100644
--- a/include/RAJA/policy/sequential/params/reduce.hpp
+++ b/include/RAJA/policy/sequential/params/reduce.hpp
@@ -3,33 +3,40 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec> > init(
+    Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
 
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec> > combine(
+    Reducer<OP, T, VOp>& out,
+    const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
+}
 
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec> > resolve(
+    Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/sequential/policy.hpp b/include/RAJA/policy/sequential/policy.hpp
index 287af42502..e60d15aaba 100644
--- a/include/RAJA/policy/sequential/policy.hpp
+++ b/include/RAJA/policy/sequential/policy.hpp
@@ -25,20 +25,16 @@ namespace RAJA
 namespace sequential
 {
 
-enum struct multi_reduce_algorithm : int
-{
-  left_fold
-};
+enum struct multi_reduce_algorithm : int { left_fold };
 
-template < multi_reduce_algorithm t_multi_algorithm >
-struct MultiReduceTuning
-{
+template <multi_reduce_algorithm t_multi_algorithm>
+struct MultiReduceTuning {
   static constexpr multi_reduce_algorithm algorithm = t_multi_algorithm;
   static constexpr bool consistent =
       (algorithm == multi_reduce_algorithm::left_fold);
 };
 
-} // namspace sequential
+}  // namespace sequential
 
 namespace policy
 {
@@ -103,15 +99,15 @@ struct seq_reduce : make_policy_pattern_launch_platform_t<Policy::sequential,
 };
 
 ///
-template < typename tuning >
-struct seq_multi_reduce_policy
-    : make_policy_pattern_launch_platform_t<Policy::sequential,
-                                            Pattern::multi_reduce,
-                                            Launch::undefined,
-                                            Platform::host,
-                                            std::conditional_t<tuning::consistent,
-                                                               reduce::ordered,
-                                                               reduce::unordered>> {
+template <typename tuning>
+struct seq_multi_reduce_policy : make_policy_pattern_launch_platform_t<
+                                     Policy::sequential,
+                                     Pattern::multi_reduce,
+                                     Launch::undefined,
+                                     Platform::host,
+                                     std::conditional_t<tuning::consistent,
+                                                        reduce::ordered,
+                                                        reduce::unordered>> {
 };
 
 ///
@@ -125,9 +121,9 @@ struct seq_atomic {
 };
 
 
-template < RAJA::sequential::multi_reduce_algorithm algorithm >
-using seq_multi_reduce_tuning = seq_multi_reduce_policy<
-    RAJA::sequential::MultiReduceTuning<algorithm> >;
+template <RAJA::sequential::multi_reduce_algorithm algorithm>
+using seq_multi_reduce_tuning =
+    seq_multi_reduce_policy<RAJA::sequential::MultiReduceTuning<algorithm>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - left_fold policies combine new values into a single value.
@@ -143,12 +139,12 @@ using seq_multi_reduce = seq_multi_reduce_left_fold;
 
 using policy::sequential::seq_atomic;
 using policy::sequential::seq_exec;
-using policy::sequential::seq_reduce;
+using policy::sequential::seq_launch_t;
 using policy::sequential::seq_multi_reduce;
+using policy::sequential::seq_reduce;
 using policy::sequential::seq_region;
 using policy::sequential::seq_segit;
 using policy::sequential::seq_work;
-using policy::sequential::seq_launch_t;
 
 
 }  // namespace RAJA
diff --git a/include/RAJA/policy/sequential/reduce.hpp b/include/RAJA/policy/sequential/reduce.hpp
index 0870726183..35e4123393 100644
--- a/include/RAJA/policy/sequential/reduce.hpp
+++ b/include/RAJA/policy/sequential/reduce.hpp
@@ -22,14 +22,10 @@
 #define RAJA_sequential_reduce_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/internal/MemUtils_CPU.hpp"
-
 #include "RAJA/pattern/detail/reduce.hpp"
 #include "RAJA/pattern/reduce.hpp"
-
 #include "RAJA/policy/sequential/policy.hpp"
-
 #include "RAJA/util/types.hpp"
 
 namespace RAJA
diff --git a/include/RAJA/policy/sequential/scan.hpp b/include/RAJA/policy/sequential/scan.hpp
index 4bcc73366d..25235c255a 100644
--- a/include/RAJA/policy/sequential/scan.hpp
+++ b/include/RAJA/policy/sequential/scan.hpp
@@ -18,17 +18,14 @@
 #ifndef RAJA_scan_sequential_HPP
 #define RAJA_scan_sequential_HPP
 
-#include "RAJA/config.hpp"
-
 #include <algorithm>
 #include <functional>
 #include <iterator>
 
-#include "RAJA/util/macros.hpp"
-
-#include "RAJA/util/concepts.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/policy/sequential/policy.hpp"
+#include "RAJA/util/concepts.hpp"
+#include "RAJA/util/macros.hpp"
 
 namespace RAJA
 {
@@ -41,15 +38,13 @@ namespace scan
    initial value
 */
 template <typename ExecPolicy, typename Iter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-inclusive_inplace(
-    resources::Host host_res,
-    const ExecPolicy &,
-    Iter begin,
-    Iter end,
-    BinFn f)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+inclusive_inplace(resources::Host host_res,
+                  const ExecPolicy &,
+                  Iter begin,
+                  Iter end,
+                  BinFn f)
 {
   using ValueT = typename std::remove_reference<decltype(*begin)>::type;
   ValueT agg = *begin;
@@ -67,16 +62,14 @@ inclusive_inplace(
    initial value
 */
 template <typename ExecPolicy, typename Iter, typename BinFn, typename T>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-exclusive_inplace(
-    resources::Host host_res,
-    const ExecPolicy &,
-    Iter begin,
-    Iter end,
-    BinFn f,
-    T v)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+exclusive_inplace(resources::Host host_res,
+                  const ExecPolicy &,
+                  Iter begin,
+                  Iter end,
+                  BinFn f,
+                  T v)
 {
   using std::distance;
   const auto n = distance(begin, end);
@@ -99,16 +92,14 @@ exclusive_inplace(
    initial value
 */
 template <typename ExecPolicy, typename Iter, typename OutIter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-inclusive(
-    resources::Host host_res,
-    const ExecPolicy &,
-    const Iter begin,
-    const Iter end,
-    OutIter out,
-    BinFn f)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+inclusive(resources::Host host_res,
+          const ExecPolicy &,
+          const Iter begin,
+          const Iter end,
+          OutIter out,
+          BinFn f)
 {
   using ValueT = typename std::remove_reference<decltype(*out)>::type;
   ValueT agg = *begin;
@@ -131,17 +122,15 @@ template <typename ExecPolicy,
           typename OutIter,
           typename BinFn,
           typename T>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-exclusive(
-    resources::Host host_res,
-    const ExecPolicy &,
-    const Iter begin,
-    const Iter end,
-    OutIter out,
-    BinFn f,
-    T v)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+exclusive(resources::Host host_res,
+          const ExecPolicy &,
+          const Iter begin,
+          const Iter end,
+          OutIter out,
+          BinFn f,
+          T v)
 {
   using ValueT = typename std::remove_reference<decltype(*out)>::type;
   ValueT agg = v;
diff --git a/include/RAJA/policy/sequential/sort.hpp b/include/RAJA/policy/sequential/sort.hpp
index 98dcf6fc27..322164e1e3 100644
--- a/include/RAJA/policy/sequential/sort.hpp
+++ b/include/RAJA/policy/sequential/sort.hpp
@@ -18,22 +18,17 @@
 #ifndef RAJA_sort_sequential_HPP
 #define RAJA_sort_sequential_HPP
 
-#include "RAJA/config.hpp"
-
 #include <algorithm>
 #include <functional>
 #include <iterator>
 
-#include "RAJA/util/macros.hpp"
-
+#include "RAJA/config.hpp"
+#include "RAJA/policy/sequential/policy.hpp"
 #include "RAJA/util/concepts.hpp"
-
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/sort.hpp"
 #include "RAJA/util/zip.hpp"
 
-#include "RAJA/util/sort.hpp" 
-
-#include "RAJA/policy/sequential/policy.hpp"
-
 namespace RAJA
 {
 namespace impl
@@ -48,11 +43,9 @@ namespace detail
     \brief Functional that performs an unstable sort with the
            given arguments, uses RAJA::intro_sort
 */
-struct UnstableSorter
-{
-  template < typename... Args >
-  RAJA_INLINE
-  void operator()(Args&&... args) const
+struct UnstableSorter {
+  template <typename... Args>
+  RAJA_INLINE void operator()(Args&&... args) const
   {
     RAJA::detail::intro_sort(std::forward<Args>(args)...);
   }
@@ -62,17 +55,15 @@ struct UnstableSorter
     \brief Functional that performs a stable sort with the
            given arguments, calls RAJA::merge_sort
 */
-struct StableSorter
-{
-  template < typename... Args >
-  RAJA_INLINE
-  void operator()(Args&&... args) const
+struct StableSorter {
+  template <typename... Args>
+  RAJA_INLINE void operator()(Args&&... args) const
   {
     RAJA::detail::merge_sort(std::forward<Args>(args)...);
   }
 };
 
-} // namespace detail
+}  // namespace detail
 
 /*!
         \brief sort given range using comparison function
@@ -80,12 +71,11 @@ struct StableSorter
 template <typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-unstable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+unstable(resources::Host host_res,
+         const ExecPolicy&,
+         Iter begin,
+         Iter end,
+         Compare comp)
 {
   detail::UnstableSorter{}(begin, end, comp);
 
@@ -98,12 +88,11 @@ unstable(
 template <typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-stable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+stable(resources::Host host_res,
+       const ExecPolicy&,
+       Iter begin,
+       Iter end,
+       Compare comp)
 {
   detail::StableSorter{}(begin, end, comp);
 
@@ -113,19 +102,21 @@ stable(
 /*!
         \brief sort given range of pairs using comparison function on keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template <typename ExecPolicy,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-unstable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+unstable_pairs(resources::Host host_res,
+               const ExecPolicy&,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               Compare comp)
 {
   auto begin = RAJA::zip(keys_begin, vals_begin);
-  auto end = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto end = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
   detail::UnstableSorter{}(begin, end, RAJA::compare_first<zip_ref>(comp));
 
@@ -133,21 +124,24 @@ unstable_pairs(
 }
 
 /*!
-        \brief stable sort given range of pairs using comparison function on keys
+        \brief stable sort given range of pairs using comparison function on
+   keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template <typename ExecPolicy,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-stable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+stable_pairs(resources::Host host_res,
+             const ExecPolicy&,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             Compare comp)
 {
   auto begin = RAJA::zip(keys_begin, vals_begin);
-  auto end = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto end = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
   detail::StableSorter{}(begin, end, RAJA::compare_first<zip_ref>(comp));
 
diff --git a/include/RAJA/policy/simd.hpp b/include/RAJA/policy/simd.hpp
index 6cb6cd4c57..1dd77ea1a2 100644
--- a/include/RAJA/policy/simd.hpp
+++ b/include/RAJA/policy/simd.hpp
@@ -20,10 +20,10 @@
 #ifndef RAJA_simd_HPP
 #define RAJA_simd_HPP
 
-#include "RAJA/policy/simd/forall.hpp"
-#include "RAJA/policy/simd/policy.hpp"
 #include "RAJA/policy/sequential/launch.hpp"
+#include "RAJA/policy/simd/forall.hpp"
 #include "RAJA/policy/simd/kernel/For.hpp"
 #include "RAJA/policy/simd/kernel/ForICount.hpp"
+#include "RAJA/policy/simd/policy.hpp"
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/simd/forall.hpp b/include/RAJA/policy/simd/forall.hpp
index 8c5b38af9c..ff19d4b8c5 100644
--- a/include/RAJA/policy/simd/forall.hpp
+++ b/include/RAJA/policy/simd/forall.hpp
@@ -26,18 +26,14 @@
 #ifndef RAJA_forall_simd_HPP
 #define RAJA_forall_simd_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iterator>
 #include <type_traits>
 
-#include "RAJA/util/types.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/internal/fault_tolerance.hpp"
-
-#include "RAJA/policy/simd/policy.hpp"
-
 #include "RAJA/pattern/params/forall.hpp"
+#include "RAJA/policy/simd/policy.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -48,12 +44,10 @@ namespace simd
 
 
 template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(RAJA::resources::Host host_res,
             const simd_exec &,
             Iterable &&iter,
@@ -75,12 +69,10 @@ forall_impl(RAJA::resources::Host host_res,
 }
 
 template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(RAJA::resources::Host host_res,
             const simd_exec &,
             Iterable &&iter,
diff --git a/include/RAJA/policy/simd/kernel/For.hpp b/include/RAJA/policy/simd/kernel/For.hpp
index 53ed45ad1f..1d259a0438 100644
--- a/include/RAJA/policy/simd/kernel/For.hpp
+++ b/include/RAJA/policy/simd/kernel/For.hpp
@@ -18,13 +18,12 @@
 #ifndef RAJA_policy_simd_kernel_For_HPP
 #define RAJA_policy_simd_kernel_For_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
 #include "RAJA/policy/simd/policy.hpp"
 
 namespace RAJA
@@ -44,7 +43,7 @@ struct TypeIsLambda {
   static const bool value = false;
 };
 
-template <camp::idx_t BodyIdx, typename ... Args>
+template <camp::idx_t BodyIdx, typename... Args>
 struct TypeIsLambda<RAJA::statement::Lambda<BodyIdx, Args...>> {
   static const bool value = true;
 };
@@ -98,7 +97,8 @@ struct Invoke_all_Lambda<Types, Statement, StatementRest...> {
  */
 template <camp::idx_t ArgumentId, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::For<ArgumentId, RAJA::simd_exec, EnclosedStmts...>, Types> {
+    statement::For<ArgumentId, RAJA::simd_exec, EnclosedStmts...>,
+    Types> {
 
   template <typename Data>
   static RAJA_INLINE void exec(Data &&data)
@@ -118,12 +118,13 @@ struct StatementExecutor<
       // Privatize data for SIMD correctness reasons
       using RAJA::internal::thread_privatize;
       auto privatizer = thread_privatize(data);
-      auto& private_data = privatizer.get_priv();
+      auto &private_data = privatizer.get_priv();
 
       // Assign offset on privatized data
       private_data.template assign_offset<ArgumentId>(i);
 
-      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(private_data);
+      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(
+          private_data);
     }
   }
 };
diff --git a/include/RAJA/policy/simd/kernel/ForICount.hpp b/include/RAJA/policy/simd/kernel/ForICount.hpp
index 36a169f2bf..c1ec6b86eb 100644
--- a/include/RAJA/policy/simd/kernel/ForICount.hpp
+++ b/include/RAJA/policy/simd/kernel/ForICount.hpp
@@ -18,15 +18,14 @@
 #ifndef RAJA_policy_simd_kernel_ForICount_HPP
 #define RAJA_policy_simd_kernel_ForICount_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
-#include "RAJA/policy/simd/policy.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
 #include "RAJA/policy/simd/kernel/For.hpp"
+#include "RAJA/policy/simd/policy.hpp"
 
 namespace RAJA
 {
@@ -42,14 +41,17 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <camp::idx_t ArgumentId, typename ParamId,
-          typename... EnclosedStmts, typename Types>
+template <camp::idx_t ArgumentId,
+          typename ParamId,
+          typename... EnclosedStmts,
+          typename Types>
 struct StatementExecutor<
-    statement::ForICount<ArgumentId, ParamId, RAJA::simd_exec,
-                         EnclosedStmts...>, Types> {
+    statement::
+        ForICount<ArgumentId, ParamId, RAJA::simd_exec, EnclosedStmts...>,
+    Types> {
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
@@ -72,7 +74,8 @@ struct StatementExecutor<
       auto privatizer = thread_privatize(data);
       auto& private_data = privatizer.get_priv();
 
-      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(private_data);
+      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(
+          private_data);
     }
   }
 };
@@ -81,4 +84,4 @@ struct StatementExecutor<
 }  // end namespace RAJA
 
 
-#endif 
+#endif
diff --git a/include/RAJA/policy/sycl.hpp b/include/RAJA/policy/sycl.hpp
index 491e39910c..0e77a26115 100644
--- a/include/RAJA/policy/sycl.hpp
+++ b/include/RAJA/policy/sycl.hpp
@@ -24,11 +24,10 @@
 
 #if defined(RAJA_SYCL_ACTIVE)
 
-#include "RAJA/util/sycl_compat.hpp"
-
 #include "RAJA/policy/sycl/forall.hpp"
 #include "RAJA/policy/sycl/policy.hpp"
 #include "RAJA/policy/sycl/reduce.hpp"
+#include "RAJA/util/sycl_compat.hpp"
 //#include "RAJA/policy/sycl/multi_reduce.hpp"
 //#include "RAJA/policy/sycl/scan.hpp"
 //#include "RAJA/policy/sycl/sort.hpp"
diff --git a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
index 1a8c9930dd..bdba5e2fed 100644
--- a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
+++ b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
@@ -23,20 +23,18 @@
 
 #if defined(RAJA_ENABLE_SYCL)
 
-#include "RAJA/util/sycl_compat.hpp"
-
 #include <cassert>
 #include <cstddef>
 #include <cstdio>
 #include <type_traits>
 #include <unordered_map>
 
+#include "RAJA/policy/sycl/policy.hpp"
 #include "RAJA/util/basic_mempool.hpp"
 #include "RAJA/util/mutex.hpp"
+#include "RAJA/util/sycl_compat.hpp"
 #include "RAJA/util/types.hpp"
 
-#include "RAJA/policy/sycl/policy.hpp"
-
 namespace RAJA
 {
 
@@ -146,4 +144,3 @@ using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 #endif  // closing endif for RAJA_ENABLE_SYCL
 
 #endif  // closing endif for header file include guard
-
diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp
index 1c6876e328..4a9dc63400 100644
--- a/include/RAJA/policy/sycl/forall.hpp
+++ b/include/RAJA/policy/sycl/forall.hpp
@@ -29,23 +29,16 @@
 #include <algorithm>
 #include <chrono>
 
-#include "RAJA/util/sycl_compat.hpp"
-
+#include "RAJA/index/IndexSet.hpp"
+#include "RAJA/internal/fault_tolerance.hpp"
 #include "RAJA/pattern/forall.hpp"
-
 #include "RAJA/pattern/params/forall.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/internal/fault_tolerance.hpp"
-
 #include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
 #include "RAJA/policy/sycl/policy.hpp"
-
-#include "RAJA/index/IndexSet.hpp"
-
+#include "RAJA/util/macros.hpp"
 #include "RAJA/util/resource.hpp"
+#include "RAJA/util/sycl_compat.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -85,23 +78,28 @@ ::sycl::range<1> getGridDim(size_t len, size_t block_size)
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Sycl>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Sycl &sycl_res,
+template <typename Iterable,
+          typename LoopBody,
+          size_t BlockSize,
+          bool Async,
+          typename ForallParam,
+          typename std::enable_if<std::is_trivially_copyable<LoopBody>{},
+                                  bool>::type = true>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Sycl>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
             ForallParam)
 {
 
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
+  using Iterator = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
   //
   // Compute the requested iteration space size
@@ -125,35 +123,41 @@ forall_impl(resources::Sycl &sycl_res,
     ::sycl::queue* q = sycl_res.get_queue();
 
     q->submit([&](::sycl::handler& h) {
-
-      h.parallel_for( ::sycl::nd_range<1>{gridSize, blockSize},
-                      [=]  (::sycl::nd_item<1> it) {
-
-        IndexType ii = it.get_global_id(0);
-        if (ii < len) {
-          loop_body(begin[ii]);
-        }
-      });
+      h.parallel_for(::sycl::nd_range<1>{gridSize, blockSize},
+                     [=](::sycl::nd_item<1> it) {
+                       IndexType ii = it.get_global_id(0);
+                       if (ii < len) {
+                         loop_body(begin[ii]);
+                       }
+                     });
     });
 
-    if (!Async) { q->wait(); }
+    if (!Async) {
+      q->wait();
+    }
   }
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<!std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE 
-resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
-            sycl_exec<BlockSize, Async>,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam)
+template <typename Iterable,
+          typename LoopBody,
+          size_t BlockSize,
+          bool Async,
+          typename ForallParam,
+          typename std::enable_if<!std::is_trivially_copyable<LoopBody>{},
+                                  bool>::type = true>
+RAJA_INLINE resources::EventProxy<resources::Sycl> forall_impl(
+    resources::Sycl& sycl_res,
+    sycl_exec<BlockSize, Async>,
+    Iterable&& iter,
+    LoopBody&& loop_body,
+    ForallParam)
 {
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
+  using Iterator = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
   //
   // Compute the requested iteration space size
@@ -186,24 +190,22 @@ resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+    lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
     q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
 
-    beg = (Iterator*) ::sycl::malloc_device(sizeof(Iterator), *q);
+    beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
     q->memcpy(beg, &begin, sizeof(Iterator)).wait();
 
     q->submit([&](::sycl::handler& h) {
+       h.parallel_for(::sycl::nd_range<1>{gridSize, blockSize},
+                      [=](::sycl::nd_item<1> it) {
+                        Index_type ii = it.get_global_id(0);
 
-      h.parallel_for( ::sycl::nd_range<1>{gridSize, blockSize},
-                      [=]  (::sycl::nd_item<1> it) {
-
-        Index_type ii = it.get_global_id(0);
-
-        if (ii < len) {
-          (*lbody)((*beg)[ii]);
-        }
-      });
-    }).wait(); // Need to wait for completion to free memory
+                        if (ii < len) {
+                          (*lbody)((*beg)[ii]);
+                        }
+                      });
+     }).wait();  // Need to wait for completion to free memory
 
     // Free our device memory
     ::sycl::free(lbody, *q);
@@ -215,23 +217,29 @@ resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
   return resources::EventProxy<resources::Sycl>(sycl_res);
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE
-concepts::enable_if_t< 
-  resources::EventProxy<resources::Sycl>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Sycl &sycl_res,
+template <typename Iterable,
+          typename LoopBody,
+          size_t BlockSize,
+          bool Async,
+          typename ForallParam,
+          typename std::enable_if<std::is_trivially_copyable<LoopBody>{},
+                                  bool>::type = true>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Sycl>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
             ForallParam f_params)
 
 {
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
+  using Iterator = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
   using EXEC_POL = RAJA::sycl_exec<BlockSize, Async>;
   //
   // Compute the requested iteration space size
@@ -253,57 +261,61 @@ forall_impl(resources::Sycl &sycl_res,
 
     ::sycl::queue* q = sycl_res.get_queue();
 
-    auto combiner = []( ForallParam x, ForallParam y ) {
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+    auto combiner = [](ForallParam x, ForallParam y) {
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
       return x;
     };
 
-    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1,*q);
+    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
     RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
     auto reduction = ::sycl::reduction(res, f_params, combiner);
 
     q->submit([&](::sycl::handler& h) {
-      h.parallel_for( ::sycl::range<1>(len),
-                      reduction,
-                      [=]   (::sycl::item<1> it, auto & red)  {
-
-        ForallParam fp;
-	RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
-        IndexType ii = it.get_id(0);
-        if (ii < len) {
-          RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
-        }
-        red.combine(fp);
-      });
+      h.parallel_for(::sycl::range<1>(len),
+                     reduction,
+                     [=](::sycl::item<1> it, auto& red) {
+                       ForallParam fp;
+                       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+                       IndexType ii = it.get_id(0);
+                       if (ii < len) {
+                         RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
+                       }
+                       red.combine(fp);
+                     });
     });
 
     q->wait();
-    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( f_params, *res );
+    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params, *res);
     ::sycl::free(res, *q);
   }
   RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
-
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<!std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE
-concepts::enable_if_t< 
-  resources::EventProxy<resources::Sycl>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Sycl &sycl_res,
+template <typename Iterable,
+          typename LoopBody,
+          size_t BlockSize,
+          bool Async,
+          typename ForallParam,
+          typename std::enable_if<!std::is_trivially_copyable<LoopBody>{},
+                                  bool>::type = true>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Sycl>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
             ForallParam f_params)
 
 {
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
+  using Iterator = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
   using EXEC_POL = RAJA::sycl_exec<BlockSize, Async>;
   //
   // Compute the requested iteration space size
@@ -324,8 +336,8 @@ forall_impl(resources::Sycl &sycl_res,
 
     ::sycl::queue* q = sycl_res.get_queue();
 
-    auto combiner = []( ForallParam x, ForallParam y ) {
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+    auto combiner = [](ForallParam x, ForallParam y) {
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
       return x;
     };
 
@@ -339,45 +351,40 @@ forall_impl(resources::Sycl &sycl_res,
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+    lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
     q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
 
-    beg = (Iterator*) ::sycl::malloc_device(sizeof(Iterator), *q);
+    beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
     q->memcpy(beg, &begin, sizeof(Iterator)).wait();
 
-    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1,*q);
+    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
     RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
     auto reduction = ::sycl::reduction(res, f_params, combiner);
 
     q->submit([&](::sycl::handler& h) {
-      h.parallel_for( ::sycl::range<1>(len),
+       h.parallel_for(::sycl::range<1>(len),
                       reduction,
-                      [=]   (::sycl::item<1> it, auto & red)  {
-
-
-        Index_type ii = it.get_id(0);
-        ForallParam fp;
-	RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
-        if (ii < len) {
-          RAJA::expt::invoke_body(fp, *lbody, (*beg)[ii]);
-        }
-        red.combine(fp);
-
-      });
-    }).wait(); // Need to wait for completion to free memory
-    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( f_params, *res );
+                      [=](::sycl::item<1> it, auto& red) {
+                        Index_type ii = it.get_id(0);
+                        ForallParam fp;
+                        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+                        if (ii < len) {
+                          RAJA::expt::invoke_body(fp, *lbody, (*beg)[ii]);
+                        }
+                        red.combine(fp);
+                      });
+     }).wait();  // Need to wait for completion to free memory
+    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params, *res);
     // Free our device memory
     ::sycl::free(res, *q);
     ::sycl::free(lbody, *q);
     ::sycl::free(beg, *q);
 
     RAJA_FT_END;
-
   }
   RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
-
 }
 
 
@@ -403,23 +410,21 @@ template <typename LoopBody,
           size_t BlockSize,
           bool Async,
           typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &r,
-                                                    ExecPolicy<seq_segit, sycl_exec<BlockSize, Async>>,
-                                                    const TypedIndexSet<SegmentTypes...>& iset,
-                                                    LoopBody&& loop_body)
+RAJA_INLINE resources::EventProxy<resources::Sycl> forall_impl(
+    resources::Sycl& r,
+    ExecPolicy<seq_segit, sycl_exec<BlockSize, Async>>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
   for (int isi = 0; isi < num_seg; ++isi) {
-    iset.segmentCall(r,
-                     isi,
-                     detail::CallForall(),
-                     sycl_exec<BlockSize, true>(),
-                     loop_body);
+    iset.segmentCall(
+        r, isi, detail::CallForall(), sycl_exec<BlockSize, true>(), loop_body);
   }  // iterate over segments of index set
 
-  if ( !Async ) {
+  if (!Async) {
     ::sycl::queue* q = r.get_queue();
-    q->wait(); 
+    q->wait();
   }
 
   return resources::EventProxy<resources::Sycl>(r);
diff --git a/include/RAJA/policy/sycl/kernel.hpp b/include/RAJA/policy/sycl/kernel.hpp
index 641c3a9ef3..cfecb12ed9 100644
--- a/include/RAJA/policy/sycl/kernel.hpp
+++ b/include/RAJA/policy/sycl/kernel.hpp
@@ -20,9 +20,9 @@
 #define RAJA_policy_sycl_kernel_HPP
 
 #include "RAJA/policy/sycl/kernel/Conditional.hpp"
-#include "RAJA/policy/sycl/kernel/SyclKernel.hpp"
 #include "RAJA/policy/sycl/kernel/For.hpp"
 #include "RAJA/policy/sycl/kernel/ForICount.hpp"
+#include "RAJA/policy/sycl/kernel/SyclKernel.hpp"
 //#include "RAJA/policy/sycl/kernel/Hyperplane.hpp"
 //#include "RAJA/policy/sycl/kernel/InitLocalMem.hpp"
 #include "RAJA/policy/sycl/kernel/Lambda.hpp"
diff --git a/include/RAJA/policy/sycl/kernel/Conditional.hpp b/include/RAJA/policy/sycl/kernel/Conditional.hpp
index e2e6b09e6d..33ba8a7e32 100644
--- a/include/RAJA/policy/sycl/kernel/Conditional.hpp
+++ b/include/RAJA/policy/sycl/kernel/Conditional.hpp
@@ -18,17 +18,14 @@
 #ifndef RAJA_policy_sycl_kernel_Conditional_HPP
 #define RAJA_policy_sycl_kernel_Conditional_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/kernel/Conditional.hpp"
-
 #include "RAJA/policy/sycl/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -48,10 +45,9 @@ struct SyclStatementExecutor<Data,
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     if (Conditional::eval(data)) {
 
@@ -61,10 +57,7 @@ struct SyclStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
diff --git a/include/RAJA/policy/sycl/kernel/For.hpp b/include/RAJA/policy/sycl/kernel/For.hpp
index 2019bfa7a9..0deb3788b1 100644
--- a/include/RAJA/policy/sycl/kernel/For.hpp
+++ b/include/RAJA/policy/sycl/kernel/For.hpp
@@ -19,7 +19,6 @@
 #define RAJA_policy_sycl_kernel_For_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/sycl/kernel/internal.hpp"
 
 
@@ -45,7 +44,9 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::sycl_global_012<Dim, Local_Size>, EnclosedStmts...>,
+    statement::For<ArgumentId,
+                   RAJA::sycl_global_012<Dim, Local_Size>,
+                   EnclosedStmts...>,
     Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
@@ -58,22 +59,21 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
     auto i = item.get_global_id(Dim);
 
-      // Assign the x thread to the argument
-      data.template assign_offset<ArgumentId>(i);
+    // Assign the x thread to the argument
+    data.template assign_offset<ArgumentId>(i);
 
-      // execute enclosed statements
-      enclosed_stmts_t::exec(data, item, thread_active && (i<len));
+    // execute enclosed statements
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     auto len = segment_length<ArgumentId>(data);
 
@@ -108,10 +108,11 @@ template <typename Data,
           int Dim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_group_012_direct<Dim>, EnclosedStmts...>,
-    Types> {
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_group_012_direct<Dim>,
+                                            EnclosedStmts...>,
+                             Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -123,22 +124,21 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
     auto i = item.get_group(Dim);
 
-      // Assign the x thread to the argument
-      data.template assign_offset<ArgumentId>(i);
+    // Assign the x thread to the argument
+    data.template assign_offset<ArgumentId>(i);
 
-      // execute enclosed statements
-      enclosed_stmts_t::exec(data, item, thread_active && (i<len));
+    // execute enclosed statements
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     auto len = segment_length<ArgumentId>(data);
 
@@ -171,10 +171,11 @@ template <typename Data,
           int Dim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_group_012_loop<Dim>, EnclosedStmts...>,
-    Types> {
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_group_012_loop<Dim>,
+                                            EnclosedStmts...>,
+                             Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -186,14 +187,15 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
     auto i0 = item.get_group(Dim);
     auto i_stride = item.get_group_range(Dim);
 
-    for(auto i = i0;i < len;i += i_stride){
+    for (auto i = i0; i < len; i += i_stride) {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -203,9 +205,7 @@ struct SyclStatementExecutor<
     }
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     auto len = segment_length<ArgumentId>(data);
 
@@ -213,7 +213,7 @@ struct SyclStatementExecutor<
     LaunchDims dims;
     if (Dim == 0) {
       dims.group.x = len;
-    } 
+    }
     if (Dim == 1) {
       dims.group.y = len;
     }
@@ -237,10 +237,11 @@ template <typename Data,
           int Dim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_local_012_direct<Dim>, EnclosedStmts...>,
-    Types> {
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_local_012_direct<Dim>,
+                                            EnclosedStmts...>,
+                             Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -252,8 +253,9 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
     auto i = item.get_local_id(Dim);
@@ -262,13 +264,10 @@ struct SyclStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, item, thread_active && (i<len));
-
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     auto len = segment_length<ArgumentId>(data);
 
@@ -301,10 +300,11 @@ template <typename Data,
           int Dim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_local_012_loop<Dim>, EnclosedStmts...>,
-    Types> {
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_local_012_loop<Dim>,
+                                            EnclosedStmts...>,
+                             Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -316,15 +316,16 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
     auto i0 = item.get_local_id(Dim);
     auto i_stride = item.get_local_range(Dim);
     auto i = i0;
 
-    for(; i < len;i += i_stride){
+    for (; i < len; i += i_stride) {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -333,8 +334,7 @@ struct SyclStatementExecutor<
       enclosed_stmts_t::exec(data, item, thread_active);
     }
     // do we need one more masked iteration?
-    if(i - i0 < len)
-    {
+    if (i - i0 < len) {
       // execute enclosed statements one more time, but masking them off
       // this is because there's at least one thread that isn't masked off
       // that is still executing the above loop
@@ -342,9 +342,7 @@ struct SyclStatementExecutor<
     }
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     auto len = segment_length<ArgumentId>(data);
 
@@ -392,8 +390,7 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item)
+  static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item)
   {
     auto len = segment_length<ArgumentId>(data);
     auto i = item.get_global_id(0);
@@ -409,9 +406,7 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     auto len = segment_length<ArgumentId>(data);
 
@@ -451,17 +446,17 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
 
-    using idx_type = camp::decay<decltype(camp::get<ArgumentId>(data.offset_tuple))>;
+    using idx_type =
+        camp::decay<decltype(camp::get<ArgumentId>(data.offset_tuple))>;
 
     idx_type len = segment_length<ArgumentId>(data);
 
-    for(idx_type i = 0;i < len;++ i){
+    for (idx_type i = 0; i < len; ++i) {
       // Assign i to the argument
       data.template assign_offset<ArgumentId>(i);
 
@@ -470,9 +465,7 @@ struct SyclStatementExecutor<
     }
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
@@ -483,4 +476,4 @@ struct SyclStatementExecutor<
 }  // end namespace RAJA
 
 
-#endif 
+#endif
diff --git a/include/RAJA/policy/sycl/kernel/ForICount.hpp b/include/RAJA/policy/sycl/kernel/ForICount.hpp
index fcd3b1824d..17017e2c51 100644
--- a/include/RAJA/policy/sycl/kernel/ForICount.hpp
+++ b/include/RAJA/policy/sycl/kernel/ForICount.hpp
@@ -20,7 +20,6 @@
 #define RAJA_policy_sycl_kernel_ForICount_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/policy/sycl/kernel/internal.hpp"
 
 
@@ -31,7 +30,6 @@ namespace internal
 {
 
 
-
 /*
  * Executor for local work sharing loop inside SyclKernel.
  * Mapping directly from local id to indices
@@ -46,24 +44,31 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_local_012_direct<ThreadDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_012_direct<ThreadDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_direct<ThreadDim>, EnclosedStmts...>, Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_012_direct<ThreadDim>,
+                         EnclosedStmts...>,
+          Types> {
 
   using Base = SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_direct<ThreadDim>, EnclosedStmts...>,
-        Types>;
+      Data,
+      statement::For<ArgumentId,
+                     RAJA::sycl_local_012_direct<ThreadDim>,
+                     EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     diff_t len = segment_length<ArgumentId>(data);
     auto i = item.get_local_id(ThreadDim);
@@ -73,14 +78,11 @@ struct SyclStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, item, thread_active && (i<len));
-
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 };
 
 
-
-
 /*
  * Executor for local work sharing loop inside SyclKernel.
  * Assigns the loop index to offset ArgumentId
@@ -89,39 +91,44 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::sycl_local_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public SyclStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::sycl_local_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = SyclStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::sycl_local_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_local_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     diff_t len = segment_length<ArgumentId>(data);
     auto i0 = item.get_local_id(0);
@@ -132,15 +139,11 @@ struct SyclStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, item, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
-
 };
 
 
-
-
-
 /*
  * Executor for local work sharing loop inside SyclKernel.
  * Assigns the loop index to offset ArgumentId
@@ -149,45 +152,50 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::sycl_local_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public SyclStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::sycl_local_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = SyclStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::sycl_local_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_local_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // masked size strided loop
     diff_t len = segment_length<ArgumentId>(data);
     auto i0 = item.get_local_id(0);
     diff_t i_init = mask_t::maskValue(i0);
-    diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
     for (diff_t ii = 0; ii < len; ii += i_stride) {
@@ -205,13 +213,9 @@ struct SyclStatementExecutor<
       enclosed_stmts_t::exec(data, item, thread_active && have_work);
     }
   }
-
 };
 
 
-
-
-
 /*
  * Executor for thread work sharing loop inside SyclKernel.
  * Provides a block-stride loop (stride of blockDim.xyz) for
@@ -227,23 +231,31 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_local_012_loop<ThreadDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_012_loop<ThreadDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_loop<ThreadDim>, EnclosedStmts...>,
-        Types> {
-
-  using Base = SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_loop<ThreadDim>, EnclosedStmts...>,
-        Types>;
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_012_loop<ThreadDim>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_local_012_loop<ThreadDim>,
+                                           EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // block stride loop
     diff_t len = segment_length<ArgumentId>(data);
@@ -269,7 +281,6 @@ struct SyclStatementExecutor<
 };
 
 
-
 /*
  * Executor for group work sharing inside SyclKernel.
  * Provides a direct mapping of each block in 012.
@@ -284,23 +295,31 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_group_012_direct<BlockDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_group_012_direct<BlockDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_group_012_direct<BlockDim>, EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_group_012_direct<BlockDim>,
+                         EnclosedStmts...>,
+          Types> {
 
   using Base = SyclStatementExecutor<
       Data,
-      statement::For<ArgumentId, RAJA::sycl_group_012_direct<BlockDim>, EnclosedStmts...>,
+      statement::For<ArgumentId,
+                     RAJA::sycl_group_012_direct<BlockDim>,
+                     EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // grid stride loop
     diff_t len = segment_length<ArgumentId>(data);
@@ -333,23 +352,31 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_group_012_loop<BlockDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_group_012_loop<BlockDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_group_012_loop<BlockDim>, EnclosedStmts...>,
-        Types> {
-
-  using Base = SyclStatementExecutor<
-      Data,
-      statement::For<ArgumentId, RAJA::sycl_group_012_loop<BlockDim>, EnclosedStmts...>,
-      Types>;
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_group_012_loop<BlockDim>,
+                         EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_group_012_loop<BlockDim>,
+                                           EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // grid stride loop
     diff_t len = segment_length<ArgumentId>(data);
@@ -384,26 +411,28 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>, Types >
+    statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, seq_exec, EnclosedStmts...>, Types > {
+          Data,
+          statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
+          Types> {
 
   using Base = SyclStatementExecutor<
       Data,
-      statement::For<ArgumentId, seq_exec, EnclosedStmts...>, Types >;
+      statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     diff_t len = segment_length<ArgumentId>(data);
 
-    for(diff_t i = 0;i < len;++ i){
+    for (diff_t i = 0; i < len; ++i) {
       // Assign i to the argument
       data.template assign_offset<ArgumentId>(i);
       data.template assign_param<ParamId>(i);
@@ -415,9 +444,6 @@ struct SyclStatementExecutor<
 };
 
 
-
-
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/sycl/kernel/Lambda.hpp b/include/RAJA/policy/sycl/kernel/Lambda.hpp
index 05f4fb3a44..5e13d81b9c 100644
--- a/include/RAJA/policy/sycl/kernel/Lambda.hpp
+++ b/include/RAJA/policy/sycl/kernel/Lambda.hpp
@@ -26,13 +26,11 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 
 namespace RAJA
@@ -42,22 +40,27 @@ namespace internal
 
 // SyclStatementExecutor for actually invoking the lambda
 
-template <typename Data, camp::idx_t LambdaIndex, typename... Args, typename Types>
-struct SyclStatementExecutor<Data, statement::Lambda<LambdaIndex, Args...>, Types> {
-
-  static
-  inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+template <typename Data,
+          camp::idx_t LambdaIndex,
+          typename... Args,
+          typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::Lambda<LambdaIndex, Args...>,
+                             Types> {
+
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Only execute the lambda if it hasn't been masked off
-    if(thread_active){
-      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(data);
+    if (thread_active) {
+      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(
+          data);
     }
-
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const &RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
diff --git a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
index f339bccef5..b237653e3b 100644
--- a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
+++ b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
@@ -26,19 +26,15 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/For.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
-
 #include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
-#include "RAJA/policy/sycl/policy.hpp"
-
 #include "RAJA/policy/sycl/kernel/internal.hpp"
+#include "RAJA/policy/sycl/policy.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -49,10 +45,10 @@ namespace RAJA
  */
 template <bool async0>
 struct sycl_launch : public RAJA::make_policy_pattern_launch_platform_t<
-                            RAJA::Policy::sycl,
-                            RAJA::Pattern::forall,
-                            detail::get_launch<async0>::value,
-                            RAJA::Platform::sycl>{
+                         RAJA::Policy::sycl,
+                         RAJA::Pattern::forall,
+                         detail::get_launch<async0>::value,
+                         RAJA::Platform::sycl> {
 };
 
 namespace statement
@@ -71,20 +67,16 @@ struct SyclKernelExt
  * The kernel launch is synchronous.
  */
 template <typename... EnclosedStmts>
-using SyclKernel =
-    SyclKernelExt<sycl_launch<false>,
-                  EnclosedStmts...>;
+using SyclKernel = SyclKernelExt<sycl_launch<false>, EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a SYCL kernel.
  * The kernel launch is asynchronous.
  */
 template <typename... EnclosedStmts>
-using SyclKernelAsync =
-    SyclKernelExt<sycl_launch<true>,
-                  EnclosedStmts...>;
+using SyclKernelAsync = SyclKernelExt<sycl_launch<true>, EnclosedStmts...>;
 
-} // namespace statement
+}  // namespace statement
 
 namespace internal
 {
@@ -107,7 +99,11 @@ void SyclKernelLauncher(Data data, ::sycl::nd_item<3> item)
  * Helper class that handles SYCL kernel launching, and computing
  * maximum number of threads/blocks
  */
-template<bool IsTriviallyCopyable, typename LaunchPolicy, typename StmtList, typename Data, typename Types>
+template <bool IsTriviallyCopyable,
+          typename LaunchPolicy,
+          typename StmtList,
+          typename Data,
+          typename Types>
 struct SyclLaunchHelper;
 
 /*!
@@ -115,17 +111,17 @@ struct SyclLaunchHelper;
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the SYCL occupancy calculator.
  */
-template<bool async0, typename StmtList, typename Data, typename Types>
-struct SyclLaunchHelper<false,sycl_launch<async0>,StmtList,Data,Types>
-{
+template <bool async0, typename StmtList, typename Data, typename Types>
+struct SyclLaunchHelper<false, sycl_launch<async0>, StmtList, Data, Types> {
   using Self = SyclLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
   using data_t = camp::decay<Data>;
 
-  static void launch(Data &&data,
+  static void launch(Data&& data,
                      internal::LaunchDims launch_dims,
                      size_t shmem,
                      ::sycl::queue* qu)
@@ -136,21 +132,17 @@ struct SyclLaunchHelper<false,sycl_launch<async0>,StmtList,Data,Types>
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    data_t* m_data = (data_t*) ::sycl::malloc_device(sizeof(data_t), *qu);
+    data_t* m_data = (data_t*)::sycl::malloc_device(sizeof(data_t), *qu);
     qu->memcpy(m_data, &data, sizeof(data_t)).wait();
 
     qu->submit([&](::sycl::handler& h) {
- 
-      h.parallel_for(launch_dims.fit_nd_range(qu),
-                     [=] (::sycl::nd_item<3> item) {
-        
-        SyclKernelLauncher<Data, executor_t>(*m_data, item);
-
-      });
-    }).wait(); // Need to wait to free memory
+        h.parallel_for(launch_dims.fit_nd_range(qu),
+                       [=](::sycl::nd_item<3> item) {
+                         SyclKernelLauncher<Data, executor_t>(*m_data, item);
+                       });
+      }).wait();  // Need to wait to free memory
 
     ::sycl::free(m_data, *qu);
-
   }
 };
 
@@ -159,34 +151,32 @@ struct SyclLaunchHelper<false,sycl_launch<async0>,StmtList,Data,Types>
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the SYCL occupancy calculator.
  */
-template<bool async0, typename StmtList, typename Data, typename Types>
-struct SyclLaunchHelper<true,sycl_launch<async0>,StmtList,Data,Types>
-{
+template <bool async0, typename StmtList, typename Data, typename Types>
+struct SyclLaunchHelper<true, sycl_launch<async0>, StmtList, Data, Types> {
   using Self = SyclLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
   using data_t = camp::decay<Data>;
 
-  static void launch(Data &&data,
+  static void launch(Data&& data,
                      internal::LaunchDims launch_dims,
                      size_t shmem,
                      ::sycl::queue* qu)
   {
 
     qu->submit([&](::sycl::handler& h) {
- 
       h.parallel_for(launch_dims.fit_nd_range(qu),
-                     [=] (::sycl::nd_item<3> item) {
-
-        SyclKernelLauncher<Data, executor_t>(data, item);
-
-      });
+                     [=](::sycl::nd_item<3> item) {
+                       SyclKernelLauncher<Data, executor_t>(data, item);
+                     });
     });
 
-    if (!async) { qu->wait(); };
-
+    if (!async) {
+      qu->wait();  // synchronous policy: block until the queue's work is done
+    }
   }
 };
 
@@ -195,38 +185,42 @@ struct SyclLaunchHelper<true,sycl_launch<async0>,StmtList,Data,Types>
  */
 template <typename LaunchConfig, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::SyclKernelExt<LaunchConfig, EnclosedStmts...>, Types> {
+    statement::SyclKernelExt<LaunchConfig, EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using StatementType =
       statement::SyclKernelExt<LaunchConfig, EnclosedStmts...>;
 
   template <typename Data>
-  static inline void exec(Data &&data)
+  static inline void exec(Data&& data)
   {
 
     using data_t = camp::decay<Data>;
-    using executor_t = sycl_statement_list_executor_t<stmt_list_t, data_t, Types>;
+    using executor_t =
+        sycl_statement_list_executor_t<stmt_list_t, data_t, Types>;
     using launch_t = SyclLaunchHelper<std::is_trivially_copyable<data_t>::value,
-                                      LaunchConfig, stmt_list_t, data_t, Types>;
+                                      LaunchConfig,
+                                      stmt_list_t,
+                                      data_t,
+                                      Types>;
 
     camp::resources::Sycl res = data.get_resource();
-    ::sycl::queue* q = res.get_queue();;
+    // Queue obtained from the kernel data's SYCL resource.
+    ::sycl::queue* q = res.get_queue();
 
     //
     // Compute the requested kernel dimensions
     //
     LaunchDims launch_dims = executor_t::calculateDimensions(data);
-    
+
     int shmem = 0;
 
     //
     // Launch the kernels
     //
     launch_t::launch(std::move(data), launch_dims, shmem, q);
-
   }
-
 };
 
 
diff --git a/include/RAJA/policy/sycl/kernel/Tile.hpp b/include/RAJA/policy/sycl/kernel/Tile.hpp
index 59590b2556..e7f4a87f4a 100644
--- a/include/RAJA/policy/sycl/kernel/Tile.hpp
+++ b/include/RAJA/policy/sycl/kernel/Tile.hpp
@@ -1,12 +1,12 @@
- /*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file for SYCL tiled executors.
- *
- ******************************************************************************
- */
+/*!
+******************************************************************************
+*
+* \file
+*
+* \brief   Header file for SYCL tiled executors.
+*
+******************************************************************************
+*/
 
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
@@ -27,16 +27,14 @@
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/pattern/kernel/Tile.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 #include "camp/camp.hpp"
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/pattern/kernel/Tile.hpp"
-#include "RAJA/pattern/kernel/internal.hpp"
-
 namespace RAJA
 {
 namespace internal
@@ -54,14 +52,17 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>
-{
+    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+    Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active){
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
+  {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
 
@@ -89,9 +90,7 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
 
     // privatize data, so we can mess with the segments
@@ -124,14 +123,12 @@ template <typename Data,
           int BlockDim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_direct<BlockDim>,
-                    EnclosedStmts...>,
-                    Types>
-  {
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_group_012_direct<BlockDim>,
+                                             EnclosedStmts...>,
+                             Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -139,7 +136,9 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -148,8 +147,10 @@ struct SyclStatementExecutor<
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    //diff_t i = get_sycl_dim<BlockDim>(blockIdx) * chunk_size; // TODO
-    diff_t i = item.get_group(BlockDim) * chunk_size;//get_sycl_dim<BlockDim>(blockIdx) * chunk_size; // TODO
+    // diff_t i = get_sycl_dim<BlockDim>(blockIdx) * chunk_size; // TODO
+    // SYCL form of the line above (TODO carried over): work-group id along
+    // BlockDim scaled by chunk_size.
+    diff_t i = item.get_group(BlockDim) * chunk_size;
 
     // check have chunk
     if (i < len) {
@@ -169,9 +170,7 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
 
     // Compute how many blocks
@@ -217,13 +216,12 @@ template <typename Data,
           int BlockDim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_loop<BlockDim>,
-                    EnclosedStmts...>, Types>
-  {
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_group_012_loop<BlockDim>,
+                                             EnclosedStmts...>,
+                             Types> {
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -231,7 +229,9 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -242,8 +242,8 @@ struct SyclStatementExecutor<
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    diff_t i_init = item.get_group(BlockDim) * chunk_size; // TODO
-    diff_t i_stride = item.get_group_range(BlockDim) * chunk_size; // TODO
+    diff_t i_init = item.get_group(BlockDim) * chunk_size;          // TODO
+    diff_t i_stride = item.get_group_range(BlockDim) * chunk_size;  // TODO
 
     // Iterate through grid stride of chunks
     for (diff_t i = i_init; i < len; i += i_stride) {
@@ -260,9 +260,7 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
 
     // Compute how many blocks
@@ -276,7 +274,6 @@ struct SyclStatementExecutor<
     set_sycl_dim<BlockDim>(dims.group, num_blocks);
 
 
-
     // privatize data, so we can mess with the segments
     using data_t = camp::decay<Data>;
     data_t private_data = data;
@@ -296,7 +293,6 @@ struct SyclStatementExecutor<
 };
 
 
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::Tile
  * Assigns the tile segment to segment ArgumentId
@@ -306,22 +302,24 @@ template <typename Data,
           camp::idx_t ArgumentId,
           camp::idx_t chunk_size,
           int ThreadDim,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-  Data,
-  statement::Tile<ArgumentId,
-                  RAJA::tile_fixed<chunk_size>,
-                  sycl_local_012_direct<ThreadDim>,
-                  EnclosedStmts ...>, Types>{
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_local_012_direct<ThreadDim>,
+                                             EnclosedStmts...>,
+                             Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -350,15 +348,13 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
 
     // Compute how many blocks
     diff_t len = segment_length<ArgumentId>(data);
     diff_t num_threads = len / chunk_size;
-    if(num_threads * chunk_size < len){
+    if (num_threads * chunk_size < len) {
       num_threads++;
     }
 
@@ -378,9 +374,9 @@ struct SyclStatementExecutor<
 
 
     LaunchDims enclosed_dims =
-      enclosed_stmts_t::calculateDimensions(private_data);
+        enclosed_stmts_t::calculateDimensions(private_data);
 
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
@@ -394,22 +390,24 @@ template <typename Data,
           camp::idx_t ArgumentId,
           camp::idx_t chunk_size,
           int ThreadDim,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-  Data,
-  statement::Tile<ArgumentId,
-                  RAJA::tile_fixed<chunk_size>,
-                  sycl_local_012_loop<ThreadDim>,
-                  EnclosedStmts ...>, Types>{
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_local_012_loop<ThreadDim>,
+                                             EnclosedStmts...>,
+                             Types> {
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -444,15 +442,13 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
 
     // Compute how many blocks
     diff_t len = segment_length<ArgumentId>(data);
     diff_t num_threads = len / chunk_size;
-    if(num_threads * chunk_size < len){
+    if (num_threads * chunk_size < len) {
       num_threads++;
     }
     num_threads = std::max(num_threads, (diff_t)1);
@@ -473,15 +469,13 @@ struct SyclStatementExecutor<
 
 
     LaunchDims enclosed_dims =
-      enclosed_stmts_t::calculateDimensions(private_data);
+        enclosed_stmts_t::calculateDimensions(private_data);
 
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
 
-
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/sycl/kernel/TileTCount.hpp b/include/RAJA/policy/sycl/kernel/TileTCount.hpp
index f27dafca80..6a4096e5f7 100644
--- a/include/RAJA/policy/sycl/kernel/TileTCount.hpp
+++ b/include/RAJA/policy/sycl/kernel/TileTCount.hpp
@@ -27,16 +27,14 @@
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/pattern/kernel/Tile.hpp"
+#include "RAJA/pattern/kernel/internal.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
 #include "camp/camp.hpp"
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
-#include "RAJA/pattern/kernel/Tile.hpp"
-#include "RAJA/pattern/kernel/internal.hpp"
-
 namespace RAJA
 {
 namespace internal
@@ -55,22 +53,26 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>, Types>
+    statement::
+        TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types> {
+          Data,
+          statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+          Types> {
 
   using Base = SyclStatementExecutor<
       Data,
-      statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>;
+      statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active){
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
+  {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
 
@@ -114,34 +116,34 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_direct<BlockDim>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_direct<BlockDim>,
+                          EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_group_012_direct<BlockDim>,
-                        EnclosedStmts...>,
-                        Types> {
-
-  using Base = SyclStatementExecutor<
-      Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      sycl_group_012_direct<BlockDim>,
-                      EnclosedStmts...>,
-                      Types>;
+          Data,
+          statement::Tile<ArgumentId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_direct<BlockDim>,
+                          EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_group_012_direct<BlockDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -150,7 +152,7 @@ struct SyclStatementExecutor<
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    //diff_t t = get_sycl_dim<BlockDim>(blockIdx);
+    // diff_t t = get_sycl_dim<BlockDim>(blockIdx);
     diff_t t = item.get_group(BlockDim);
     diff_t i = t * chunk_size;
 
@@ -187,34 +189,34 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_loop<BlockDim>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_loop<BlockDim>,
+                          EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_group_012_loop<BlockDim>,
-                        EnclosedStmts...>,
-                        Types> {
-
-  using Base = SyclStatementExecutor<
-      Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      sycl_group_012_loop<BlockDim>,
-                      EnclosedStmts...>,
-                      Types>;
+          Data,
+          statement::Tile<ArgumentId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_loop<BlockDim>,
+                          EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_group_012_loop<BlockDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -247,7 +249,6 @@ struct SyclStatementExecutor<
 };
 
 
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::TileTCount
  * Assigns the tile segment to segment ArgumentId
@@ -258,38 +259,38 @@ template <typename Data,
           typename ParamId,
           camp::idx_t chunk_size,
           int ThreadDim,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::TileTCount<ArgumentId, ParamId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_local_012_direct<ThreadDim>,
-                        EnclosedStmts ...>,
-                        Types>
-  : public SyclStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_local_012_direct<ThreadDim>,
-                    EnclosedStmts ...>,
-                    Types> {
-
-  using Base = SyclStatementExecutor<
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_local_012_direct<ThreadDim>,
+                          EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
           statement::Tile<ArgumentId,
                           RAJA::tile_fixed<chunk_size>,
                           sycl_local_012_direct<ThreadDim>,
-                          EnclosedStmts ...>,
-                          Types>;
+                          EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_local_012_direct<ThreadDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -300,7 +301,7 @@ struct SyclStatementExecutor<
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    //diff_t t = get_sycl_dim<ThreadDim>(threadIdx);
+    // diff_t t = get_sycl_dim<ThreadDim>(threadIdx);
     diff_t t = item.get_local_id(ThreadDim);
     diff_t i = t * chunk_size;
 
@@ -332,38 +333,38 @@ template <typename Data,
           typename ParamId,
           camp::idx_t chunk_size,
           int ThreadDim,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::TileTCount<ArgumentId, ParamId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_local_012_loop<ThreadDim>,
-                        EnclosedStmts ...>,
-                        Types>
-  : public SyclStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_local_012_loop<ThreadDim>,
-                    EnclosedStmts ...>,
-                    Types> {
-
-  using Base = SyclStatementExecutor<
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_local_012_loop<ThreadDim>,
+                          EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
           statement::Tile<ArgumentId,
                           RAJA::tile_fixed<chunk_size>,
                           sycl_local_012_loop<ThreadDim>,
-                          EnclosedStmts ...>,
-                          Types>;
+                          EnclosedStmts...>,
+          Types> {
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_local_012_loop<ThreadDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
     auto &segment = camp::get<ArgumentId>(data.segment_tuple);
@@ -374,15 +375,15 @@ struct SyclStatementExecutor<
 
     // compute trip count
     diff_t len = segment_length<ArgumentId>(data);
-//    diff_t t_init = get_sycl_dim<ThreadDim>(threadIdx);
+    //    diff_t t_init = get_sycl_dim<ThreadDim>(threadIdx);
     diff_t t_init = item.get_local_id(ThreadDim);
     diff_t i_init = t_init * chunk_size;
-//    diff_t t_stride = get_sycl_dim<ThreadDim>(blockDim);
+    //    diff_t t_stride = get_sycl_dim<ThreadDim>(blockDim);
     diff_t t_stride = item.get_local_range(ThreadDim);
     diff_t i_stride = t_stride * chunk_size;
 
     // Iterate through grid stride of chunks
-    for(diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
+    for (diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
       diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
diff --git a/include/RAJA/policy/sycl/kernel/internal.hpp b/include/RAJA/policy/sycl/kernel/internal.hpp
index 3498550c25..1f16b643bc 100644
--- a/include/RAJA/policy/sycl/kernel/internal.hpp
+++ b/include/RAJA/policy/sycl/kernel/internal.hpp
@@ -26,15 +26,12 @@
 #include <cassert>
 #include <climits>
 
-#include "camp/camp.hpp"
-
 #include "RAJA/pattern/kernel.hpp"
-
-#include "RAJA/util/macros.hpp"
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
 #include "RAJA/policy/sycl/policy.hpp"
+#include "RAJA/util/macros.hpp"
+#include "RAJA/util/types.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -52,17 +49,19 @@ struct LaunchDims {
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  LaunchDims() : group{0,0,0},
-                 local{1,1,1},
-                 global{1,1,1},
-                 min_groups{0,0,0},
-                 min_locals{0,0,0} {}
+  LaunchDims()
+      : group{0, 0, 0},
+        local{1, 1, 1},
+        global{1, 1, 1},
+        min_groups{0, 0, 0},
+        min_locals{0, 0, 0}
+  {
+  }
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  LaunchDims(LaunchDims const &c) : group(c.group),
-                                    local(c.local),
-                                    global(c.global)
+  LaunchDims(LaunchDims const &c)
+      : group(c.group), local(c.local), global(c.global)
   {
   }
 
@@ -86,48 +85,51 @@ struct LaunchDims {
     return result;
   }
 
-  ::sycl::nd_range<3> fit_nd_range(::sycl::queue* q) {
+  ::sycl::nd_range<3> fit_nd_range(::sycl::queue *q)
+  {
 
     sycl_dim_3_t launch_global;
 
-    sycl_dim_3_t launch_local {1,1,1};
-    launch_local.x = std::max(launch_local.x, local.x); 
+    sycl_dim_3_t launch_local{1, 1, 1};
+    launch_local.x = std::max(launch_local.x, local.x);
     launch_local.y = std::max(launch_local.y, local.y);
     launch_local.z = std::max(launch_local.z, local.z);
 
     ::sycl::device dev = q->get_device();
 
-    auto max_work_group_size = dev.get_info< ::sycl::info::device::max_work_group_size>();
+    auto max_work_group_size =
+        dev.get_info< ::sycl::info::device::max_work_group_size>();
 
-    if(launch_local.x > max_work_group_size) {
+    if (launch_local.x > max_work_group_size) {
       launch_local.x = max_work_group_size;
     }
-    if(launch_local.y > max_work_group_size) {
+    if (launch_local.y > max_work_group_size) {
       launch_local.y = max_work_group_size;
     }
-    if(launch_local.z > max_work_group_size) {
+    if (launch_local.z > max_work_group_size) {
       launch_local.z = max_work_group_size;
     }
 
 
     // Make sure the multiple of locals fits
     // Prefer larger z -> y -> x
-    if(launch_local.x * launch_local.y * launch_local.z > max_work_group_size) {
+    if (launch_local.x * launch_local.y * launch_local.z >
+        max_work_group_size) {
       int remaining = 1;
       // local z cannot be > max_wrk from above
-      // if equal then remaining is 1, on handle < 
-      if(max_work_group_size > launch_local.z) {
+      // if equal then remaining is 1, only handle <
+      if (max_work_group_size > launch_local.z) {
         // keep local z
         remaining = max_work_group_size / launch_local.z;
       }
-      if(remaining >= launch_local.y) {
+      if (remaining >= launch_local.y) {
         // keep local y
         remaining = remaining / launch_local.y;
       } else {
         launch_local.y = remaining;
         remaining = remaining / launch_local.y;
       }
-      if(remaining < launch_local.x) {
+      if (remaining < launch_local.x) {
         launch_local.x = remaining;
       }
     }
@@ -135,33 +137,41 @@ struct LaunchDims {
 
     // User gave group policy, use to calculate global space
     if (group.x != 0 || group.y != 0 || group.z != 0) {
-      sycl_dim_3_t launch_group {1,1,1};
+      sycl_dim_3_t launch_group{1, 1, 1};
       launch_group.x = std::max(launch_group.x, group.x);
       launch_group.y = std::max(launch_group.y, group.y);
       launch_group.z = std::max(launch_group.z, group.z);
 
       launch_global.x = launch_local.x * launch_group.x;
-      launch_global.y = launch_local.y * launch_group.y; 
+      launch_global.y = launch_local.y * launch_group.y;
       launch_global.z = launch_local.z * launch_group.z;
     } else {
-      launch_global.x = launch_local.x * ((global.x + (launch_local.x - 1)) / launch_local.x);
-      launch_global.y = launch_local.y * ((global.y + (launch_local.y - 1)) / launch_local.y);
-      launch_global.z = launch_local.z * ((global.z + (launch_local.z - 1)) / launch_local.z);
+      launch_global.x =
+          launch_local.x * ((global.x + (launch_local.x - 1)) / launch_local.x);
+      launch_global.y =
+          launch_local.y * ((global.y + (launch_local.y - 1)) / launch_local.y);
+      launch_global.z =
+          launch_local.z * ((global.z + (launch_local.z - 1)) / launch_local.z);
     }
 
 
-    if(launch_global.x % launch_local.x != 0) {
-      launch_global.x = ((launch_global.x / launch_local.x) + 1) * launch_local.x; 
+    if (launch_global.x % launch_local.x != 0) {
+      launch_global.x =
+          ((launch_global.x / launch_local.x) + 1) * launch_local.x;
     }
-    if(launch_global.y % launch_local.y != 0) {
-      launch_global.y = ((launch_global.y / launch_local.y) + 1) * launch_local.y; 
+    if (launch_global.y % launch_local.y != 0) {
+      launch_global.y =
+          ((launch_global.y / launch_local.y) + 1) * launch_local.y;
     }
-    if(launch_global.z % launch_local.z != 0) {
-      launch_global.z = ((launch_global.z / launch_local.z) + 1) * launch_local.z; 
+    if (launch_global.z % launch_local.z != 0) {
+      launch_global.z =
+          ((launch_global.z / launch_local.z) + 1) * launch_local.z;
     }
 
     ::sycl::range<3> ret_th = {launch_local.x, launch_local.y, launch_local.z};
-    ::sycl::range<3> ret_gl = {launch_global.x, launch_global.y, launch_global.z};
+    ::sycl::range<3> ret_gl = {launch_global.x,
+                               launch_global.y,
+                               launch_global.z};
 
     return ::sycl::nd_range<3>(ret_gl, ret_th);
   }
@@ -176,7 +186,9 @@ struct SyclStatementListExecutorHelper {
   using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  inline static RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Execute stmt
     cur_stmt_t::exec(data, item, thread_active);
@@ -230,18 +242,16 @@ struct SyclStatementListExecutor<Data, StatementList<Stmts...>, Types> {
 
   static constexpr size_t num_stmts = sizeof...(Stmts);
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data &data,
+                                      ::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Execute statements in order with helper class
-    SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, item, thread_active);
+    SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
+        data, item, thread_active);
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const &data)
   {
     // Compute this statements launch dimensions
     return SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::
@@ -250,10 +260,8 @@ struct SyclStatementListExecutor<Data, StatementList<Stmts...>, Types> {
 };
 
 template <typename StmtList, typename Data, typename Types>
-using sycl_statement_list_executor_t = SyclStatementListExecutor<
-    Data,
-    StmtList,
-    Types>;
+using sycl_statement_list_executor_t =
+    SyclStatementListExecutor<Data, StmtList, Types>;
 
 }  // namespace internal
 }  // namespace RAJA
diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp
index c8bc7aab53..1652c3312d 100644
--- a/include/RAJA/policy/sycl/launch.hpp
+++ b/include/RAJA/policy/sycl/launch.hpp
@@ -18,10 +18,10 @@
 #ifndef RAJA_pattern_launch_sycl_HPP
 #define RAJA_pattern_launch_sycl_HPP
 
-#include "RAJA/pattern/launch/launch_core.hpp"
 #include "RAJA/pattern/detail/privatizer.hpp"
-#include "RAJA/policy/sycl/policy.hpp"
+#include "RAJA/pattern/launch/launch_core.hpp"
 #include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
+#include "RAJA/policy/sycl/policy.hpp"
 //#include "RAJA/policy/sycl/raja_syclerrchk.hpp"
 #include "RAJA/util/resource.hpp"
 
@@ -31,79 +31,93 @@ namespace RAJA
 template <bool async>
 struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
 
- //If the launch lambda is trivially copyable
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name,
-       BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  // If the launch lambda is trivially copyable
+  template <typename BODY_IN,
+            typename ReduceParams,
+            typename std::enable_if<std::is_trivially_copyable<BODY_IN>{},
+                                    bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
   {
 
     /*Get the queue from concrete resource */
-    ::sycl::queue* q = res.get<camp::resources::Sycl>().get_queue();
+    ::sycl::queue *q = res.get<camp::resources::Sycl>().get_queue();
 
     //
     // Compute the number of blocks and threads
     //
 
     const ::sycl::range<3> blockSize(params.threads.value[2],
-				     params.threads.value[1],
-				     params.threads.value[0]);
+                                     params.threads.value[1],
+                                     params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2],
-				    params.threads.value[1] * params.teams.value[1],
-				    params.threads.value[0] * params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        params.threads.value[2] * params.teams.value[2],
+        params.threads.value[1] * params.teams.value[1],
+        params.threads.value[0] * params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( params.threads.value[0]  > zero && params.threads.value[1]  > zero && params.threads.value[2] > zero &&
-         params.teams.value[0] > zero && params.teams.value[1] > zero && params.teams.value[2]> zero ) {
+    if (params.threads.value[0] > zero && params.threads.value[1] > zero &&
+        params.threads.value[2] > zero && params.teams.value[0] > zero &&
+        params.teams.value[1] > zero && params.teams.value[2] > zero) {
 
       RAJA_FT_BEGIN;
 
-      q->submit([&](::sycl::handler& h) {
-
-        auto s_vec = ::sycl::local_accessor<char, 1> (params.shared_mem_size, h);
-
-        h.parallel_for
-          (::sycl::nd_range<3>(gridSize, blockSize),
-           [=] (::sycl::nd_item<3> itm) {
-
-            LaunchContext ctx;
-            ctx.itm = &itm;
-
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+      q->submit([&](::sycl::handler &h) {
+        auto s_vec = ::sycl::local_accessor<char, 1>(params.shared_mem_size, h);
 
-            body_in(ctx);
+        h.parallel_for(
+            ::sycl::nd_range<3>(gridSize, blockSize),
+            [=](::sycl::nd_item<3> itm) {
+              LaunchContext ctx;
+              ctx.itm = &itm;
 
-           });
+              // Point to shared memory
+              ctx.shared_mem_ptr =
+                  s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
 
+              body_in(ctx);
+            });
       });
 
-    if (!async) { q->wait(); }
+      if (!async) {
+        q->wait();
+      }
 
       RAJA_FT_END;
-
     }
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
- //If the launch lambda is trivially copyable and we have explcit reduction parameters
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name,
-       BODY_IN &&body_in, ReduceParams launch_reducers)
+  // If the launch lambda is trivially copyable and we have explicit reduction
+  // parameters
+  template <typename BODY_IN,
+            typename ReduceParams,
+            typename std::enable_if<std::is_trivially_copyable<BODY_IN>{},
+                                    bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &launch_params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams launch_reducers)
   {
 
     /*Get the queue from concrete resource */
-    ::sycl::queue* q = res.get<camp::resources::Sycl>().get_queue();
+    ::sycl::queue *q = res.get<camp::resources::Sycl>().get_queue();
 
     using EXEC_POL = RAJA::sycl_launch_t<async, 0>;
     RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers);
@@ -112,57 +126,60 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     // Compute the number of blocks and threads
     //
     const ::sycl::range<3> blockSize(launch_params.threads.value[2],
-				     launch_params.threads.value[1],
-				     launch_params.threads.value[0]);
+                                     launch_params.threads.value[1],
+                                     launch_params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2],
-				    launch_params.threads.value[1] * launch_params.teams.value[1],
-				    launch_params.threads.value[0] * launch_params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        launch_params.threads.value[2] * launch_params.teams.value[2],
+        launch_params.threads.value[1] * launch_params.teams.value[1],
+        launch_params.threads.value[0] * launch_params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( launch_params.threads.value[0]  > zero && launch_params.threads.value[1]  > zero && launch_params.threads.value[2] > zero &&
-         launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) {
+    if (launch_params.threads.value[0] > zero &&
+        launch_params.threads.value[1] > zero &&
+        launch_params.threads.value[2] > zero &&
+        launch_params.teams.value[0] > zero &&
+        launch_params.teams.value[1] > zero &&
+        launch_params.teams.value[2] > zero) {
 
 
-      auto combiner = []( ReduceParams x, ReduceParams y ) {
-        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+      auto combiner = [](ReduceParams x, ReduceParams y) {
+        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
         return x;
-       };
+      };
 
       RAJA_FT_BEGIN;
 
-      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1,*q);
+      ReduceParams *res = ::sycl::malloc_shared<ReduceParams>(1, *q);
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
       auto reduction = ::sycl::reduction(res, launch_reducers, combiner);
 
-      q->submit([&](::sycl::handler& h) {
-
-       auto s_vec = ::sycl::local_accessor<char, 1> (launch_params.shared_mem_size, h);
-
-        h.parallel_for
-          (::sycl::nd_range<3>(gridSize, blockSize),
-           reduction,
-           [=] (::sycl::nd_item<3> itm, auto & red) {
+      q->submit([&](::sycl::handler &h) {
+         auto s_vec =
+             ::sycl::local_accessor<char, 1>(launch_params.shared_mem_size, h);
 
-            LaunchContext ctx;
-            ctx.itm = &itm;
+         h.parallel_for(
+             ::sycl::nd_range<3>(gridSize, blockSize),
+             reduction,
+             [=](::sycl::nd_item<3> itm, auto &red) {
+               LaunchContext ctx;
+               ctx.itm = &itm;
 
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+               // Point to shared memory
+               ctx.shared_mem_ptr =
+                   s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
 
-            ReduceParams fp;
-            RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+               ReduceParams fp;
+               RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
 
-            RAJA::expt::invoke_body(fp, body_in, ctx);
+               RAJA::expt::invoke_body(fp, body_in, ctx);
 
-            red.combine(fp);
+               red.combine(fp);
+             });
+       }).wait();  // Need to wait for completion to free memory
 
-           });
-
-      }).wait(); // Need to wait for completion to free memory
-
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( launch_reducers, *res );
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(launch_reducers, *res);
       ::sycl::free(res, *q);
 
       RAJA_FT_END;
@@ -170,92 +187,104 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
 
     RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers);
 
-   return resources::EventProxy<resources::Resource>(res);
+    return resources::EventProxy<resources::Resource>(res);
   }
 
-  //If the launch lambda is not trivially copyable
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<!std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name,
-       BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  // If the launch lambda is not trivially copyable
+  template <typename BODY_IN,
+            typename ReduceParams,
+            typename std::enable_if<!std::is_trivially_copyable<BODY_IN>{},
+                                    bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
   {
 
     /*Get the queue from concrete resource */
-    ::sycl::queue* q = res.get<camp::resources::Sycl>().get_queue();
+    ::sycl::queue *q = res.get<camp::resources::Sycl>().get_queue();
 
     //
     // Compute the number of blocks and threads
     //
 
     const ::sycl::range<3> blockSize(params.threads.value[2],
-				     params.threads.value[1],
-				     params.threads.value[0]);
+                                     params.threads.value[1],
+                                     params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2],
-				    params.threads.value[1] * params.teams.value[1],
-				    params.threads.value[0] * params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        params.threads.value[2] * params.teams.value[2],
+        params.threads.value[1] * params.teams.value[1],
+        params.threads.value[0] * params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( params.threads.value[0]  > zero && params.threads.value[1]  > zero && params.threads.value[2] > zero &&
-         params.teams.value[0] > zero && params.teams.value[1] > zero && params.teams.value[2]> zero ) {
+    if (params.threads.value[0] > zero && params.threads.value[1] > zero &&
+        params.threads.value[2] > zero && params.teams.value[0] > zero &&
+        params.teams.value[1] > zero && params.teams.value[2] > zero) {
 
       RAJA_FT_BEGIN;
 
       //
-      // Kernel body is nontrivially copyable, create space on device and copy to
-      // Workaround until "is_device_copyable" is supported
+      // Kernel body is nontrivially copyable: create space on device and copy
+      // to it. Workaround until "is_device_copyable" is supported.
       //
       using LOOP_BODY = camp::decay<BODY_IN>;
-      LOOP_BODY* lbody;
-      lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+      LOOP_BODY *lbody;
+      lbody = (LOOP_BODY *)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
       q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait();
 
-      q->submit([&](::sycl::handler& h) {
-
-        auto s_vec = ::sycl::local_accessor<char, 1> (params.shared_mem_size, h);
+      q->submit([&](::sycl::handler &h) {
+         auto s_vec =
+             ::sycl::local_accessor<char, 1>(params.shared_mem_size, h);
 
-        h.parallel_for
-          (::sycl::nd_range<3>(gridSize, blockSize),
-           [=] (::sycl::nd_item<3> itm) {
+         h.parallel_for(
+             ::sycl::nd_range<3>(gridSize, blockSize),
+             [=](::sycl::nd_item<3> itm) {
+               LaunchContext ctx;
+               ctx.itm = &itm;
 
-            LaunchContext ctx;
-            ctx.itm = &itm;
+               // Point to shared memory
+               ctx.shared_mem_ptr =
+                   s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
 
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
-
-            (*lbody)(ctx);
-
-           });
-
-      }).wait(); // Need to wait for completion to free memory
+               (*lbody)(ctx);
+             });
+       }).wait();  // Need to wait for completion to free memory
 
       ::sycl::free(lbody, *q);
 
       RAJA_FT_END;
-
     }
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
 
-  //If the launch lambda is not trivially copyable
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<!std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-    exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name,
-         BODY_IN &&body_in, ReduceParams launch_reducers)
+  // If the launch lambda is not trivially copyable
+  template <typename BODY_IN,
+            typename ReduceParams,
+            typename std::enable_if<!std::is_trivially_copyable<BODY_IN>{},
+                                    bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams &launch_params,
+       const char *kernel_name,
+       BODY_IN &&body_in,
+       ReduceParams launch_reducers)
   {
 
     /*Get the queue from concrete resource */
-    ::sycl::queue* q = res.get<camp::resources::Sycl>().get_queue();
+    ::sycl::queue *q = res.get<camp::resources::Sycl>().get_queue();
 
     using EXEC_POL = RAJA::sycl_launch_t<async, 0>;
     RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers);
@@ -264,66 +293,69 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     // Compute the number of blocks and threads
     //
     const ::sycl::range<3> blockSize(launch_params.threads.value[2],
-				     launch_params.threads.value[1],
-				     launch_params.threads.value[0]);
+                                     launch_params.threads.value[1],
+                                     launch_params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2],
-				    launch_params.threads.value[1] * launch_params.teams.value[1],
-				    launch_params.threads.value[0] * launch_params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        launch_params.threads.value[2] * launch_params.teams.value[2],
+        launch_params.threads.value[1] * launch_params.teams.value[1],
+        launch_params.threads.value[0] * launch_params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( launch_params.threads.value[0]  > zero && launch_params.threads.value[1]  > zero && launch_params.threads.value[2] > zero &&
-         launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) {
+    if (launch_params.threads.value[0] > zero &&
+        launch_params.threads.value[1] > zero &&
+        launch_params.threads.value[2] > zero &&
+        launch_params.teams.value[0] > zero &&
+        launch_params.teams.value[1] > zero &&
+        launch_params.teams.value[2] > zero) {
 
 
-      auto combiner = []( ReduceParams x, ReduceParams y ) {
-        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+      auto combiner = [](ReduceParams x, ReduceParams y) {
+        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
         return x;
-       };
+      };
 
       RAJA_FT_BEGIN;
 
       //
-      // Kernel body is nontrivially copyable, create space on device and copy to
-      // Workaround until "is_device_copyable" is supported
+      // Kernel body is nontrivially copyable: create space on device and copy
+      // to it. Workaround until "is_device_copyable" is supported.
       //
       using LOOP_BODY = camp::decay<BODY_IN>;
-      LOOP_BODY* lbody;
-      lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+      LOOP_BODY *lbody;
+      lbody = (LOOP_BODY *)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
       q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait();
 
-      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1,*q);
+      ReduceParams *res = ::sycl::malloc_shared<ReduceParams>(1, *q);
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
       auto reduction = ::sycl::reduction(res, launch_reducers, combiner);
 
-      q->submit([&](::sycl::handler& h) {
-
-       auto s_vec = ::sycl::local_accessor<char, 1> (launch_params.shared_mem_size, h);
-
-        h.parallel_for
-          (::sycl::nd_range<3>(gridSize, blockSize),
-           reduction,
-           [=] (::sycl::nd_item<3> itm, auto & red) {
-
-            LaunchContext ctx;
-            ctx.itm = &itm;
-
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+      q->submit([&](::sycl::handler &h) {
+         auto s_vec =
+             ::sycl::local_accessor<char, 1>(launch_params.shared_mem_size, h);
 
-            ReduceParams fp;
-            RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+         h.parallel_for(
+             ::sycl::nd_range<3>(gridSize, blockSize),
+             reduction,
+             [=](::sycl::nd_item<3> itm, auto &red) {
+               LaunchContext ctx;
+               ctx.itm = &itm;
 
-            RAJA::expt::invoke_body(fp, *lbody, ctx);
+               // Point to shared memory
+               ctx.shared_mem_ptr =
+                   s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
 
-            red.combine(fp);
+               ReduceParams fp;
+               RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
 
-           });
+               RAJA::expt::invoke_body(fp, *lbody, ctx);
 
-      }).wait(); // Need to wait for completion to free memory
+               red.combine(fp);
+             });
+       }).wait();  // Need to wait for completion to free memory
 
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( launch_reducers, *res );
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(launch_reducers, *res);
       ::sycl::free(res, *q);
       ::sycl::free(lbody, *q);
 
@@ -332,15 +364,14 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
 
     RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers);
 
-   return resources::EventProxy<resources::Resource>(res);
+    return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 /*
    SYCL global thread mapping
 */
-template<int ... DIM>
+template <int... DIM>
 struct sycl_global_item;
 
 using sycl_global_item_0 = sycl_global_item<0>;
@@ -351,50 +382,45 @@ template <typename SEGMENT, int DIM>
 struct LoopExecute<sycl_global_item<DIM>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
     {
-      const int tx =
-        ctx.itm->get_group(DIM) * ctx.itm->get_local_range(DIM) +
-        ctx.itm->get_local_id(DIM);
+      const int tx = ctx.itm->get_group(DIM) * ctx.itm->get_local_range(DIM) +
+                     ctx.itm->get_local_id(DIM);
 
       if (tx < len) body(*(segment.begin() + tx));
     }
   }
 };
 
-using sycl_global_item_01 = sycl_global_item<0,1>;
-using sycl_global_item_02 = sycl_global_item<0,2>;
-using sycl_global_item_10 = sycl_global_item<1,0>;
-using sycl_global_item_12 = sycl_global_item<1,2>;
-using sycl_global_item_20 = sycl_global_item<2,0>;
-using sycl_global_item_21 = sycl_global_item<2,1>;
+using sycl_global_item_01 = sycl_global_item<0, 1>;
+using sycl_global_item_02 = sycl_global_item<0, 2>;
+using sycl_global_item_10 = sycl_global_item<1, 0>;
+using sycl_global_item_12 = sycl_global_item<1, 2>;
+using sycl_global_item_20 = sycl_global_item<2, 0>;
+using sycl_global_item_21 = sycl_global_item<2, 1>;
 
 template <typename SEGMENT, int DIM0, int DIM1>
 struct LoopExecute<sycl_global_item<DIM0, DIM1>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           BODY const &body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
-      const int tx =
-        ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
-        ctx.itm->get_local_id(DIM0);
+      const int tx = ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
+                     ctx.itm->get_local_id(DIM0);
 
-      const int ty =
-        ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
-        ctx.itm->get_local_id(DIM1);
+      const int ty = ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
+                     ctx.itm->get_local_id(DIM1);
 
 
       if (tx < len0 && ty < len1)
@@ -404,39 +430,35 @@ struct LoopExecute<sycl_global_item<DIM0, DIM1>, SEGMENT> {
 };
 
 
-using sycl_global_item_012 = sycl_global_item<0,1,2>;
-using sycl_global_item_021 = sycl_global_item<0,2,1>;
-using sycl_global_item_102 = sycl_global_item<1,0,2>;
-using sycl_global_item_120 = sycl_global_item<1,2,0>;
-using sycl_global_item_201 = sycl_global_item<2,0,1>;
-using sycl_global_item_210 = sycl_global_item<2,1,0>;
+using sycl_global_item_012 = sycl_global_item<0, 1, 2>;
+using sycl_global_item_021 = sycl_global_item<0, 2, 1>;
+using sycl_global_item_102 = sycl_global_item<1, 0, 2>;
+using sycl_global_item_120 = sycl_global_item<1, 2, 0>;
+using sycl_global_item_201 = sycl_global_item<2, 0, 1>;
+using sycl_global_item_210 = sycl_global_item<2, 1, 0>;
 
 template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
 struct LoopExecute<sycl_global_item<DIM0, DIM1, DIM2>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           SEGMENT const &segment2,
+                                           BODY const &body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
-      const int tx =
-        ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
-        ctx.itm->get_local_id(DIM0);
+      const int tx = ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
+                     ctx.itm->get_local_id(DIM0);
 
-      const int ty =
-        ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
-        ctx.itm->get_local_id(DIM1);
+      const int ty = ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
+                     ctx.itm->get_local_id(DIM1);
 
-      const int tz =
-        ctx.itm->get_group(DIM2) * ctx.itm->get_local_range(DIM2) +
-        ctx.itm->get_local_id(DIM2);
+      const int tz = ctx.itm->get_group(DIM2) * ctx.itm->get_local_range(DIM2) +
+                     ctx.itm->get_local_id(DIM2);
 
       if (tx < len0 && ty < len1 && tz < len2)
         body(*(segment0.begin() + tx),
@@ -449,48 +471,66 @@ struct LoopExecute<sycl_global_item<DIM0, DIM1, DIM2>, SEGMENT> {
 /*
 Reshape threads in a block into a 1D iteration space
 */
-template<int ... dim>
-struct sycl_flatten_group_local_direct{};
-
-using sycl_flatten_group_local_01_direct = sycl_flatten_group_local_direct<0,1>;
-using sycl_flatten_group_local_02_direct = sycl_flatten_group_local_direct<0,2>;
-using sycl_flatten_group_local_10_direct = sycl_flatten_group_local_direct<1,0>;
-using sycl_flatten_group_local_12_direct = sycl_flatten_group_local_direct<1,2>;
-using sycl_flatten_group_local_20_direct = sycl_flatten_group_local_direct<2,0>;
-using sycl_flatten_group_local_21_direct = sycl_flatten_group_local_direct<2,1>;
-
-using sycl_flatten_group_local_012_direct = sycl_flatten_group_local_direct<0,1,2>;
-using sycl_flatten_group_local_021_direct = sycl_flatten_group_local_direct<0,2,1>;
-using sycl_flatten_group_local_102_direct = sycl_flatten_group_local_direct<1,0,2>;
-using sycl_flatten_group_local_120_direct = sycl_flatten_group_local_direct<1,2,0>;
-using sycl_flatten_group_local_201_direct = sycl_flatten_group_local_direct<2,0,1>;
-using sycl_flatten_group_local_210_direct = sycl_flatten_group_local_direct<2,1,0>;
-
-template<int ... dim>
-struct sycl_flatten_group_local_loop{};
-
-using sycl_flatten_group_local_01_loop = sycl_flatten_group_local_loop<0,1>;
-using sycl_flatten_group_local_02_loop = sycl_flatten_group_local_loop<0,2>;
-using sycl_flatten_group_local_10_loop = sycl_flatten_group_local_loop<1,0>;
-using sycl_flatten_group_local_12_loop = sycl_flatten_group_local_loop<1,2>;
-using sycl_flatten_group_local_20_loop = sycl_flatten_group_local_loop<2,0>;
-using sycl_flatten_group_local_21_loop = sycl_flatten_group_local_loop<2,1>;
-
-using sycl_flatten_group_local_012_loop = sycl_flatten_group_local_loop<0,1,2>;
-using sycl_flatten_group_local_021_loop = sycl_flatten_group_local_loop<0,2,1>;
-using sycl_flatten_group_local_102_loop = sycl_flatten_group_local_loop<1,0,2>;
-using sycl_flatten_group_local_120_loop = sycl_flatten_group_local_loop<1,2,0>;
-using sycl_flatten_group_local_201_loop = sycl_flatten_group_local_loop<2,0,1>;
-using sycl_flatten_group_local_210_loop = sycl_flatten_group_local_loop<2,1,0>;
-
-template<typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1>, SEGMENT>
-{
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template <int... dim>
+struct sycl_flatten_group_local_direct {
+};
+
+using sycl_flatten_group_local_01_direct =
+    sycl_flatten_group_local_direct<0, 1>;
+using sycl_flatten_group_local_02_direct =
+    sycl_flatten_group_local_direct<0, 2>;
+using sycl_flatten_group_local_10_direct =
+    sycl_flatten_group_local_direct<1, 0>;
+using sycl_flatten_group_local_12_direct =
+    sycl_flatten_group_local_direct<1, 2>;
+using sycl_flatten_group_local_20_direct =
+    sycl_flatten_group_local_direct<2, 0>;
+using sycl_flatten_group_local_21_direct =
+    sycl_flatten_group_local_direct<2, 1>;
+
+using sycl_flatten_group_local_012_direct =
+    sycl_flatten_group_local_direct<0, 1, 2>;
+using sycl_flatten_group_local_021_direct =
+    sycl_flatten_group_local_direct<0, 2, 1>;
+using sycl_flatten_group_local_102_direct =
+    sycl_flatten_group_local_direct<1, 0, 2>;
+using sycl_flatten_group_local_120_direct =
+    sycl_flatten_group_local_direct<1, 2, 0>;
+using sycl_flatten_group_local_201_direct =
+    sycl_flatten_group_local_direct<2, 0, 1>;
+using sycl_flatten_group_local_210_direct =
+    sycl_flatten_group_local_direct<2, 1, 0>;
+
+template <int... dim>
+struct sycl_flatten_group_local_loop {
+};
+
+using sycl_flatten_group_local_01_loop = sycl_flatten_group_local_loop<0, 1>;
+using sycl_flatten_group_local_02_loop = sycl_flatten_group_local_loop<0, 2>;
+using sycl_flatten_group_local_10_loop = sycl_flatten_group_local_loop<1, 0>;
+using sycl_flatten_group_local_12_loop = sycl_flatten_group_local_loop<1, 2>;
+using sycl_flatten_group_local_20_loop = sycl_flatten_group_local_loop<2, 0>;
+using sycl_flatten_group_local_21_loop = sycl_flatten_group_local_loop<2, 1>;
+
+using sycl_flatten_group_local_012_loop =
+    sycl_flatten_group_local_loop<0, 1, 2>;
+using sycl_flatten_group_local_021_loop =
+    sycl_flatten_group_local_loop<0, 2, 1>;
+using sycl_flatten_group_local_102_loop =
+    sycl_flatten_group_local_loop<1, 0, 2>;
+using sycl_flatten_group_local_120_loop =
+    sycl_flatten_group_local_loop<1, 2, 0>;
+using sycl_flatten_group_local_201_loop =
+    sycl_flatten_group_local_loop<2, 0, 1>;
+using sycl_flatten_group_local_210_loop =
+    sycl_flatten_group_local_loop<2, 1, 0>;
+
+template <typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1>, SEGMENT> {
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -498,21 +538,19 @@ struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1>, SEGMENT>
       const int tx = ctx.itm->get_local_id(DIM0);
       const int ty = ctx.itm->get_local_id(DIM1);
       const int bx = ctx.itm->get_local_range(DIM0);
-      const int tid = tx + bx*ty;
+      const int tid = tx + bx * ty;
 
       if (tid < len) body(*(segment.begin() + tid));
     }
   }
 };
 
-template<typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1>, SEGMENT>
-{
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template <typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1>, SEGMENT> {
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -522,21 +560,18 @@ struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1>, SEGMENT>
     const int bx = ctx.itm->get_local_range(DIM0);
     const int by = ctx.itm->get_local_range(DIM1);
 
-    for(int tid = tx + bx*ty; tid < len; tid += bx*by) {
+    for (int tid = tx + bx * ty; tid < len; tid += bx * by) {
       body(*(segment.begin() + tid));
     }
-
   }
 };
 
-template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1, DIM2>, SEGMENT>
-{
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1, DIM2>, SEGMENT> {
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
     const int len = segment.end() - segment.begin();
     {
@@ -546,21 +581,19 @@ struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1, DIM2>, SEGMENT>
       const int bx = ctx.itm->get_local_range(DIM0);
       const int by = ctx.itm->get_local_range(DIM1);
 
-      const int tid = tx + bx*(ty + by*tz);
+      const int tid = tx + bx * (ty + by * tz);
 
       if (tid < len) body(*(segment.begin() + tid));
     }
   }
 };
 
-template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1, DIM2>, SEGMENT>
-{
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1, DIM2>, SEGMENT> {
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -571,10 +604,9 @@ struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1, DIM2>, SEGMENT>
     const int by = ctx.itm->get_local_range(DIM1);
     const int bz = ctx.itm->get_local_range(DIM2);
 
-    for(int tid = tx + bx*(ty + by*tz); tid < len; tid += bx*by*bz) {
+    for (int tid = tx + bx * (ty + by * tz); tid < len; tid += bx * by * bz) {
       body(*(segment.begin() + tid));
     }
-
   }
 };
 
@@ -585,18 +617,15 @@ template <typename SEGMENT, int DIM>
 struct LoopExecute<sycl_local_012_loop<DIM>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM);
-         tx < len;
-         tx += ctx.itm->get_local_range(DIM))
-    {
+    for (int tx = ctx.itm->get_local_id(DIM); tx < len;
+         tx += ctx.itm->get_local_range(DIM)) {
       body(*(segment.begin() + tx));
     }
   }
@@ -609,10 +638,9 @@ template <typename SEGMENT, int DIM>
 struct LoopExecute<sycl_local_012_direct<DIM>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -630,17 +658,15 @@ template <typename SEGMENT, int DIM>
 struct LoopExecute<sycl_group_012_loop<DIM>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int bx = ctx.itm->get_group(DIM);
-         bx < len;
-         bx += ctx.itm->get_group_range(DIM) ) {
+    for (int bx = ctx.itm->get_group(DIM); bx < len;
+         bx += ctx.itm->get_group_range(DIM)) {
       body(*(segment.begin() + bx));
     }
   }
@@ -653,10 +679,9 @@ template <typename SEGMENT, int DIM>
 struct LoopExecute<sycl_group_012_direct<DIM>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -674,18 +699,15 @@ template <typename SEGMENT, int DIM>
 struct LoopICountExecute<sycl_local_012_loop<DIM>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM);
-         tx < len;
-         tx += ctx.itm->get_local_range(DIM) )
-    {
+    for (int tx = ctx.itm->get_local_id(DIM); tx < len;
+         tx += ctx.itm->get_local_range(DIM)) {
       body(*(segment.begin() + tx), tx);
     }
   }
@@ -698,10 +720,9 @@ template <typename SEGMENT, int DIM>
 struct LoopICountExecute<sycl_local_012_direct<DIM>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -719,17 +740,15 @@ template <typename SEGMENT, int DIM>
 struct LoopICountExecute<sycl_group_012_loop<DIM>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int bx =  ctx.itm->get_group(DIM);
-         bx < len;
-         bx += ctx.itm->get_group_range(DIM) ) {
+    for (int bx = ctx.itm->get_group(DIM); bx < len;
+         bx += ctx.itm->get_group_range(DIM)) {
       body(*(segment.begin() + bx), bx);
     }
   }
@@ -742,10 +761,9 @@ template <typename SEGMENT, int DIM>
 struct LoopICountExecute<sycl_group_012_direct<DIM>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -757,29 +775,28 @@ struct LoopICountExecute<sycl_group_012_direct<DIM>, SEGMENT> {
 };
 
 // perfectly nested sycl direct policies
-using sycl_group_01_nested_direct = sycl_group_012_direct<0,1>;
-using sycl_group_02_nested_direct = sycl_group_012_direct<0,2>;
-using sycl_group_10_nested_direct = sycl_group_012_direct<1,0>;
-using sycl_group_12_nested_direct = sycl_group_012_direct<1,2>;
-using sycl_group_20_nested_direct = sycl_group_012_direct<2,0>;
-using sycl_group_21_nested_direct = sycl_group_012_direct<2,1>;
-
-using sycl_group_012_nested_direct = sycl_group_012_direct<0,1,2>;
-using sycl_group_021_nested_direct = sycl_group_012_direct<0,2,1>;
-using sycl_group_102_nested_direct = sycl_group_012_direct<1,0,2>;
-using sycl_group_120_nested_direct = sycl_group_012_direct<1,2,0>;
-using sycl_group_201_nested_direct = sycl_group_012_direct<2,0,1>;
-using sycl_group_210_nested_direct = sycl_group_012_direct<2,1,0>;
+using sycl_group_01_nested_direct = sycl_group_012_direct<0, 1>;
+using sycl_group_02_nested_direct = sycl_group_012_direct<0, 2>;
+using sycl_group_10_nested_direct = sycl_group_012_direct<1, 0>;
+using sycl_group_12_nested_direct = sycl_group_012_direct<1, 2>;
+using sycl_group_20_nested_direct = sycl_group_012_direct<2, 0>;
+using sycl_group_21_nested_direct = sycl_group_012_direct<2, 1>;
+
+using sycl_group_012_nested_direct = sycl_group_012_direct<0, 1, 2>;
+using sycl_group_021_nested_direct = sycl_group_012_direct<0, 2, 1>;
+using sycl_group_102_nested_direct = sycl_group_012_direct<1, 0, 2>;
+using sycl_group_120_nested_direct = sycl_group_012_direct<1, 2, 0>;
+using sycl_group_201_nested_direct = sycl_group_012_direct<2, 0, 1>;
+using sycl_group_210_nested_direct = sycl_group_012_direct<2, 1, 0>;
 
 template <typename SEGMENT, int DIM0, int DIM1>
 struct LoopExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           BODY const &body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
@@ -796,12 +813,11 @@ template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
 struct LoopExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           SEGMENT const &segment2,
+                                           BODY const &body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -826,20 +842,18 @@ template <typename SEGMENT, int DIM0, int DIM1>
 struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           BODY const &body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
-      const int tx =  ctx.itm->get_group(DIM0);
-      const int ty =  ctx.itm->get_group(DIM1);
+      const int tx = ctx.itm->get_group(DIM0);
+      const int ty = ctx.itm->get_group(DIM1);
       if (tx < len0 && ty < len1)
-        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
-             tx, ty);
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty), tx, ty);
     }
   }
 };
@@ -848,12 +862,11 @@ template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
 struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           SEGMENT const &segment2,
+                                           BODY const &body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -865,48 +878,46 @@ struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
       if (tx < len0 && ty < len1 && tz < len2)
         body(*(segment0.begin() + tx),
              *(segment1.begin() + ty),
-             *(segment2.begin() + tz), tx, ty, tz);
+             *(segment2.begin() + tz),
+             tx,
+             ty,
+             tz);
     }
   }
 };
 
 // perfectly nested sycl loop policies
-using sycl_group_01_nested_loop = sycl_group_012_loop<0,1>;
-using sycl_group_02_nested_loop = sycl_group_012_loop<0,2>;
-using sycl_group_10_nested_loop = sycl_group_012_loop<1,0>;
-using sycl_group_12_nested_loop = sycl_group_012_loop<1,2>;
-using sycl_group_20_nested_loop = sycl_group_012_loop<2,0>;
-using sycl_group_21_nested_loop = sycl_group_012_loop<2,1>;
-
-using sycl_group_012_nested_loop = sycl_group_012_loop<0,1,2>;
-using sycl_group_021_nested_loop = sycl_group_012_loop<0,2,1>;
-using sycl_group_102_nested_loop = sycl_group_012_loop<1,0,2>;
-using sycl_group_120_nested_loop = sycl_group_012_loop<1,2,0>;
-using sycl_group_201_nested_loop = sycl_group_012_loop<2,0,1>;
-using sycl_group_210_nested_loop = sycl_group_012_loop<2,1,0>;
+using sycl_group_01_nested_loop = sycl_group_012_loop<0, 1>;
+using sycl_group_02_nested_loop = sycl_group_012_loop<0, 2>;
+using sycl_group_10_nested_loop = sycl_group_012_loop<1, 0>;
+using sycl_group_12_nested_loop = sycl_group_012_loop<1, 2>;
+using sycl_group_20_nested_loop = sycl_group_012_loop<2, 0>;
+using sycl_group_21_nested_loop = sycl_group_012_loop<2, 1>;
+
+using sycl_group_012_nested_loop = sycl_group_012_loop<0, 1, 2>;
+using sycl_group_021_nested_loop = sycl_group_012_loop<0, 2, 1>;
+using sycl_group_102_nested_loop = sycl_group_012_loop<1, 0, 2>;
+using sycl_group_120_nested_loop = sycl_group_012_loop<1, 2, 0>;
+using sycl_group_201_nested_loop = sycl_group_012_loop<2, 0, 1>;
+using sycl_group_210_nested_loop = sycl_group_012_loop<2, 1, 0>;
 
 template <typename SEGMENT, int DIM0, int DIM1>
 struct LoopExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           BODY const &body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
 
-      for (int bx = ctx.itm->get_group(DIM0);
-           bx < len0;
-           bx += ctx.itm->get_group_range(DIM0))
-      {
-        for (int by = ctx.itm->get_group(DIM1);
-             by < len1;
-             bx += ctx.itm->get_group_range(DIM1))
-        {
+      for (int bx = ctx.itm->get_group(DIM0); bx < len0;
+           bx += ctx.itm->get_group_range(DIM0)) {
+        for (int by = ctx.itm->get_group(DIM1); by < len1;
+             by += ctx.itm->get_group_range(DIM1)) {
           body(*(segment0.begin() + bx), *(segment1.begin() + by));
         }
       }
@@ -918,31 +929,24 @@ template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
 struct LoopExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           SEGMENT const &segment2,
+                                           BODY const &body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int bx = ctx.itm->get_group(DIM0);
-         bx < len0;
-         bx += ctx.itm->get_group_range(DIM0))
-    {
+    for (int bx = ctx.itm->get_group(DIM0); bx < len0;
+         bx += ctx.itm->get_group_range(DIM0)) {
 
-      for (int by = ctx.itm->get_group(DIM1);
-           by < len1;
-           by += ctx.itm->get_group_range(DIM1))
-      {
+      for (int by = ctx.itm->get_group(DIM1); by < len1;
+           by += ctx.itm->get_group_range(DIM1)) {
 
-        for (int bz = ctx.itm->get_group(DIM2);
-             bz < len2;
-             bz += ctx.itm->get_group_range(DIM2))
-        {
+        for (int bz = ctx.itm->get_group(DIM2); bz < len2;
+             bz += ctx.itm->get_group_range(DIM2)) {
 
           body(*(segment0.begin() + bx),
                *(segment1.begin() + by),
@@ -960,24 +964,19 @@ template <typename SEGMENT, int DIM0, int DIM1>
 struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           BODY const &body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
 
-      for (int bx = ctx.itm->get_group(DIM0);
-           bx < len0;
-           bx += ctx.itm->get_group_range(DIM0))
-      {
-        for (int by = ctx.itm->get_group(DIM0);
-             by < len1;
-             by += ctx.itm->get_group_range(DIM1))
-        {
+      for (int bx = ctx.itm->get_group(DIM0); bx < len0;
+           bx += ctx.itm->get_group_range(DIM0)) {
+        for (int by = ctx.itm->get_group(DIM1); by < len1;
+             by += ctx.itm->get_group_range(DIM1)) {
 
           body(*(segment0.begin() + bx), *(segment1.begin() + by), bx, by);
         }
@@ -990,35 +989,31 @@ template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
 struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           SEGMENT const &segment0,
+                                           SEGMENT const &segment1,
+                                           SEGMENT const &segment2,
+                                           BODY const &body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int bx = ctx.itm->get_group(DIM0);
-         bx < len0;
-         bx += ctx.itm->get_group_range(DIM0))
-    {
+    for (int bx = ctx.itm->get_group(DIM0); bx < len0;
+         bx += ctx.itm->get_group_range(DIM0)) {
 
-      for (int by = ctx.itm->get_group(DIM0);
-           by < len1;
-           by += ctx.itm->get_group_range(DIM0))
-      {
+      for (int by = ctx.itm->get_group(DIM1); by < len1;
+           by += ctx.itm->get_group_range(DIM1)) {
 
-        for (int bz =  ctx.itm->get_group(DIM0);
-             bz < len2;
-             bz += ctx.itm->get_group_range(DIM0))
-        {
+        for (int bz = ctx.itm->get_group(DIM2); bz < len2;
+             bz += ctx.itm->get_group_range(DIM2)) {
 
           body(*(segment0.begin() + bx),
                *(segment1.begin() + by),
-               *(segment2.begin() + bz), bx, by, bz);
+               *(segment2.begin() + bz),
+               bx,
+               by,
+               bz);
         }
       }
     }
@@ -1029,19 +1024,16 @@ template <typename SEGMENT, int DIM>
 struct TileExecute<sycl_local_012_loop<DIM>, SEGMENT> {
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM) * tile_size;
-         tx < len;
-         tx += ctx.itm->get_local_range(DIM) * tile_size)
-    {
+    for (int tx = ctx.itm->get_local_id(DIM) * tile_size; tx < len;
+         tx += ctx.itm->get_local_range(DIM) * tile_size) {
       body(segment.slice(tx, tile_size));
     }
   }
@@ -1052,18 +1044,16 @@ template <typename SEGMENT, int DIM>
 struct TileExecute<sycl_local_012_direct<DIM>, SEGMENT> {
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int tx = ctx.itm->get_local_id(DIM) * tile_size;
-    if(tx < len)
-    {
+    if (tx < len) {
       body(segment.slice(tx, tile_size));
     }
   }
@@ -1074,21 +1064,19 @@ template <typename SEGMENT, int DIM>
 struct TileExecute<sycl_group_012_loop<DIM>, SEGMENT> {
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_group(DIM)* tile_size;
+    for (int tx = ctx.itm->get_group(DIM) * tile_size;
 
          tx < len;
 
-         tx += ctx.itm->get_group_range(DIM) * tile_size)
-    {
+         tx += ctx.itm->get_group_range(DIM) * tile_size) {
       body(segment.slice(tx, tile_size));
     }
   }
@@ -1098,41 +1086,37 @@ template <typename SEGMENT, int DIM>
 struct TileExecute<sycl_group_012_direct<DIM>, SEGMENT> {
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int tx = ctx.itm->get_group(DIM) * tile_size;
-    if(tx < len){
+    if (tx < len) {
       body(segment.slice(tx, tile_size));
     }
   }
 };
 
-//Tile execute + return index
+// Tile execute + return index
 template <typename SEGMENT, int DIM>
 struct TileTCountExecute<sycl_local_012_loop<DIM>, SEGMENT> {
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM) * tile_size;
-         tx < len;
-         tx += ctx.itm->get_local_range(DIM) * tile_size)
-    {
-      body(segment.slice(tx, tile_size), tx/tile_size);
+    for (int tx = ctx.itm->get_local_id(DIM) * tile_size; tx < len;
+         tx += ctx.itm->get_local_range(DIM) * tile_size) {
+      body(segment.slice(tx, tile_size), tx / tile_size);
     }
   }
 };
@@ -1142,19 +1126,17 @@ template <typename SEGMENT, int DIM>
 struct TileTCountExecute<sycl_local_012_direct<DIM>, SEGMENT> {
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int tx = ctx.itm->get_local_id(DIM) * tile_size;
-    if(tx < len)
-    {
-      body(segment.slice(tx, tile_size), tx/tile_size);
+    if (tx < len) {
+      body(segment.slice(tx, tile_size), tx / tile_size);
     }
   }
 };
@@ -1164,20 +1146,17 @@ template <typename SEGMENT, int DIM>
 struct TileTCountExecute<sycl_group_012_loop<DIM>, SEGMENT> {
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int bx = ctx.itm->get_group(DIM) * tile_size;
-         bx < len;
-         bx += ctx.itm->get_group_range(DIM) * tile_size)
-    {
-      body(segment.slice(bx, tile_size), bx/tile_size);
+    for (int bx = ctx.itm->get_group(DIM) * tile_size; bx < len;
+         bx += ctx.itm->get_group_range(DIM) * tile_size) {
+      body(segment.slice(bx, tile_size), bx / tile_size);
     }
   }
 };
@@ -1187,18 +1166,17 @@ template <typename SEGMENT, int DIM>
 struct TileTCountExecute<sycl_group_012_direct<DIM>, SEGMENT> {
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const &ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const &segment,
+                                           BODY const &body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int bx = ctx.itm->get_group(DIM) * tile_size;
-    if(bx < len){
-      body(segment.slice(bx, tile_size), bx/tile_size);
+    if (bx < len) {
+      body(segment.slice(bx, tile_size), bx / tile_size);
     }
   }
 };
diff --git a/include/RAJA/policy/sycl/params/kernel_name.hpp b/include/RAJA/policy/sycl/params/kernel_name.hpp
index 1f33be19bb..8b0eb88fcd 100644
--- a/include/RAJA/policy/sycl/params/kernel_name.hpp
+++ b/include/RAJA/policy/sycl/params/kernel_name.hpp
@@ -3,39 +3,43 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-#if defined(RAJA_ENABLE_SYCL)  
-  
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  SYCL_EXTERNAL
-  combine(KernelName&, T) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-#endif  
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
-
-
-#endif //  NEW_REDUCE_SYCL_REDUCE_HPP
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+#if defined(RAJA_ENABLE_SYCL)
+
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL> > init(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template <typename EXEC_POL, typename T>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL> > SYCL_EXTERNAL
+combine(KernelName&, T)
+{
+}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL> > resolve(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+#endif
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
+
+
+#endif  //  NEW_REDUCE_SYCL_REDUCE_HPP
diff --git a/include/RAJA/policy/sycl/params/reduce.hpp b/include/RAJA/policy/sycl/params/reduce.hpp
index e2fb7e1a5a..b6734326dd 100644
--- a/include/RAJA/policy/sycl/params/reduce.hpp
+++ b/include/RAJA/policy/sycl/params/reduce.hpp
@@ -3,37 +3,44 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_SYCL)
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL> > init(
+    Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL> > combine(
+    Reducer<OP, T, VOp>& out,
+    const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL> > resolve(
+    Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_SYCL_REDUCE_HPP
+#endif  //  NEW_REDUCE_SYCL_REDUCE_HPP
diff --git a/include/RAJA/policy/sycl/policy.hpp b/include/RAJA/policy/sycl/policy.hpp
index a02c9ea30f..75b009c28e 100644
--- a/include/RAJA/policy/sycl/policy.hpp
+++ b/include/RAJA/policy/sycl/policy.hpp
@@ -22,16 +22,14 @@
 
 #if defined(RAJA_SYCL_ACTIVE)
 
-#include "RAJA/util/sycl_compat.hpp"
+#include <cstddef>
 
 #include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/policy/sequential/policy.hpp"
-
 #include "RAJA/util/Operators.hpp"
+#include "RAJA/util/sycl_compat.hpp"
 #include "RAJA/util/types.hpp"
 
-#include <cstddef>
-
 namespace RAJA
 {
 
@@ -78,10 +76,10 @@ struct sycl_exec : public RAJA::make_policy_pattern_launch_platform_t<
 
 template <bool Async, int num_threads = 0>
 struct sycl_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::sycl,
-                       RAJA::Pattern::region,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::sycl> {
+                           RAJA::Policy::sycl,
+                           RAJA::Pattern::region,
+                           detail::get_launch<Async>::value,
+                           RAJA::Platform::sycl> {
 };
 
 struct sycl_reduce
@@ -92,8 +90,9 @@ struct sycl_reduce
 // Sycl atomic policy for using sycl atomics on the device and
 // the provided Policy on the host
 //
-template<typename host_policy>
-struct sycl_atomic_explicit{};
+template <typename host_policy>
+struct sycl_atomic_explicit {
+};
 
 //
 // Default sycl atomic policy uses sycl atomics on the device and non-atomics
@@ -101,11 +100,13 @@ struct sycl_atomic_explicit{};
 //
 using sycl_atomic = sycl_atomic_explicit<seq_atomic>;
 
-template<typename Mask>
-struct sycl_local_masked_direct {};
+template <typename Mask>
+struct sycl_local_masked_direct {
+};
 
-template<typename Mask>
-struct sycl_local_masked_loop {};
+template <typename Mask>
+struct sycl_local_masked_loop {
+};
 
 }  // namespace sycl
 }  // namespace policy
@@ -120,27 +121,29 @@ using policy::sycl::sycl_local_masked_direct;
 using policy::sycl::sycl_local_masked_loop;
 
 using policy::sycl::sycl_launch_t;
-  
+
 /*!
  * Maps indices to SYCL global id
  * Optional WORK_GROUP_SIZE to
  */
-template<int dim, int WORK_GROUP_SIZE = 1>
-struct sycl_global_012{};
+template <int dim, int WORK_GROUP_SIZE = 1>
+struct sycl_global_012 {
+};
 
-template<int WORK_GROUP_SIZE>
+template <int WORK_GROUP_SIZE>
 using sycl_global_0 = sycl_global_012<0, WORK_GROUP_SIZE>;
-template<int WORK_GROUP_SIZE>
+template <int WORK_GROUP_SIZE>
 using sycl_global_1 = sycl_global_012<1, WORK_GROUP_SIZE>;
-template<int WORK_GROUP_SIZE>
+template <int WORK_GROUP_SIZE>
 using sycl_global_2 = sycl_global_012<2, WORK_GROUP_SIZE>;
 
 /*!
  * Maps segment indices to SYCL group ids.
  * Loops to allow for any value
  */
-template<int ... dim>
-struct sycl_group_012_loop{};
+template <int... dim>
+struct sycl_group_012_loop {
+};
 
 using sycl_group_0_loop = sycl_group_012_loop<0>;
 using sycl_group_1_loop = sycl_group_012_loop<1>;
@@ -150,8 +153,9 @@ using sycl_group_2_loop = sycl_group_012_loop<2>;
  * Maps segment indices to SYCL local ids.
  * Loops to allow for any value
  */
-template<int ... dim>
-struct sycl_local_012_loop{};
+template <int... dim>
+struct sycl_local_012_loop {
+};
 
 using sycl_local_0_loop = sycl_local_012_loop<0>;
 using sycl_local_1_loop = sycl_local_012_loop<1>;
@@ -160,8 +164,9 @@ using sycl_local_2_loop = sycl_local_012_loop<2>;
 /*!
  * Maps segment indices to SYCL group ids.
  */
-template<int ... dim>
-struct sycl_group_012_direct{};
+template <int... dim>
+struct sycl_group_012_direct {
+};
 
 using sycl_group_0_direct = sycl_group_012_direct<0>;
 using sycl_group_1_direct = sycl_group_012_direct<1>;
@@ -170,102 +175,84 @@ using sycl_group_2_direct = sycl_group_012_direct<2>;
 /*!
  * Maps segment indices to SYCL local ids.
  */
-template<int ... dim>
-struct sycl_local_012_direct{};
+template <int... dim>
+struct sycl_local_012_direct {
+};
 
 using sycl_local_0_direct = sycl_local_012_direct<0>;
 using sycl_local_1_direct = sycl_local_012_direct<1>;
 using sycl_local_2_direct = sycl_local_012_direct<2>;
 
 
-namespace internal{
+namespace internal
+{
 
-template<int dim>
+template <int dim>
 struct SyclDimHelper;
 
-template<>
-struct SyclDimHelper<0>{
+template <>
+struct SyclDimHelper<0> {
 
-  template<typename dim_t>
-  inline
-  static
-  constexpr
-  auto get(dim_t const &d) ->
-    decltype(d.x)
+  template <typename dim_t>
+  inline static constexpr auto get(dim_t const &d) -> decltype(d.x)
   {
     return d.x;
   }
 
-  template<typename dim_t>
-  inline
-  static
-  void set(dim_t &d, int value)
+  template <typename dim_t>
+  inline static void set(dim_t &d, int value)
   {
     d.x = value;
   }
 };
 
-template<>
-struct SyclDimHelper<1>{
+template <>
+struct SyclDimHelper<1> {
 
-  template<typename dim_t>
-  inline
-  static
-  constexpr
-  auto get(dim_t const &d) ->
-    decltype(d.x)
+  template <typename dim_t>
+  inline static constexpr auto get(dim_t const &d) -> decltype(d.x)
   {
     return d.y;
   }
 
-  template<typename dim_t>
-  inline
-  static
-  void set(dim_t &d, int value)
+  template <typename dim_t>
+  inline static void set(dim_t &d, int value)
   {
     d.y = value;
   }
 };
 
-template<>
-struct SyclDimHelper<2>{
+template <>
+struct SyclDimHelper<2> {
 
-  template<typename dim_t>
-  inline
-  static
-  constexpr
-  auto get(dim_t const &d) ->
-    decltype(d.x)
+  template <typename dim_t>
+  inline static constexpr auto get(dim_t const &d) -> decltype(d.x)
   {
     return d.z;
   }
 
-  template<typename dim_t>
-  inline
-  static
-  void set(dim_t &d, int value)
+  template <typename dim_t>
+  inline static void set(dim_t &d, int value)
   {
     d.z = value;
   }
 };
 
-template<int dim, typename dim_t>
-constexpr
-auto get_sycl_dim(dim_t const &d) ->
-  decltype(d.x)
+template <int dim, typename dim_t>
+constexpr auto get_sycl_dim(dim_t const &d) -> decltype(d.x)
 {
   return SyclDimHelper<dim>::get(d);
 }
 
-template<int dim, typename dim_t>
+template <int dim, typename dim_t>
 void set_sycl_dim(dim_t &d, int value)
 {
   return SyclDimHelper<dim>::set(d, value);
 }
-} // namespace internal
+}  // namespace internal
 
 }  // namespace RAJA
 
-#endif // RAJA_ENABLE_SYCL
+#endif  // RAJA_SYCL_ACTIVE
 
 #endif
diff --git a/include/RAJA/policy/sycl/reduce.hpp b/include/RAJA/policy/sycl/reduce.hpp
index 8a84d5b412..4e6c70a164 100644
--- a/include/RAJA/policy/sycl/reduce.hpp
+++ b/include/RAJA/policy/sycl/reduce.hpp
@@ -4,7 +4,7 @@
  * \file
  *
  * \brief   Header file for SYCL reduction stucts/classes.
- *          
+ *
  ******************************************************************************
  */
 
@@ -24,12 +24,9 @@
 
 #include <algorithm>
 
-
-#include "RAJA/util/types.hpp"
-
 #include "RAJA/pattern/reduce.hpp"
-
 #include "RAJA/policy/sycl/policy.hpp"
+#include "RAJA/util/types.hpp"
 
 namespace RAJA
 {
@@ -38,8 +35,7 @@ namespace sycl
 {
 
 template <typename T, typename I>
-struct minloc 
-{
+struct minloc {
   static constexpr T identity = T(::RAJA::operators::limits<T>::max());
   RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
                                                I &loc,
@@ -54,8 +50,7 @@ struct minloc
 };
 
 template <typename T, typename I>
-struct maxloc 
-{
+struct maxloc {
   static constexpr T identity = T(::RAJA::operators::limits<T>::min());
   RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
                                                I &loc,
@@ -74,8 +69,7 @@ struct maxloc
 static int MaxNumTeams = 1;
 
 //! Information necessary for SYCL offload to be considered
-struct Offload_Info 
-{
+struct Offload_Info {
   int hostID{1};
   int deviceID{2};
   bool isMapped{false};
@@ -91,8 +85,7 @@ struct Offload_Info
 //! Reduction data for SYCL Offload -- stores value, host pointer, and device
 //! pointer
 template <typename T>
-struct Reduce_Data
-{
+struct Reduce_Data {
   mutable T value;
   T *device;
   T *host;
@@ -107,11 +100,13 @@ struct Reduce_Data
   Reduce_Data(T initValue, T identityValue, Offload_Info &info)
       : value(initValue)
   {
-    ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
+    ::sycl::queue *q = ::camp::resources::Sycl::get_default().get_queue();
 
 
-    device = reinterpret_cast<T *>(::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q)));
-    host = reinterpret_cast<T *>(::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q)));
+    device = reinterpret_cast<T *>(
+        ::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q)));
+    host = reinterpret_cast<T *>(
+        ::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q)));
 
     if (!host) {
       printf("Unable to allocate space on host\n");
@@ -125,10 +120,7 @@ struct Reduce_Data
     hostToDevice(info);
   }
 
-  void reset(T initValue)
-  {
-    value = initValue;
-  }
+  void reset(T initValue) { value = initValue; }
 
   //! default copy constructor for POD
   Reduce_Data(const Reduce_Data &) = default;
@@ -136,9 +128,9 @@ struct Reduce_Data
   //! transfers from the host to the device -- exit() is called upon failure
   RAJA_INLINE void hostToDevice(Offload_Info &info)
   {
-    ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
+    ::sycl::queue *q = ::camp::resources::Sycl::get_default().get_queue();
 
-    if(!q) {
+    if (!q) {
       camp::resources::Resource res = camp::resources::Sycl();
       q = res.get<camp::resources::Sycl>().get_queue();
     }
@@ -154,25 +146,25 @@ struct Reduce_Data
   //! transfers from the device to the host -- exit() is called upon failure
   RAJA_INLINE void deviceToHost(Offload_Info &info)
   {
-    ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
+    ::sycl::queue *q = ::camp::resources::Sycl::get_default().get_queue();
 
-    if(!q) {
+    if (!q) {
       camp::resources::Resource res = camp::resources::Sycl();
       q = res.get<camp::resources::Sycl>().get_queue();
-    } 
+    }
 
     // precondition: host and device are valid pointers
     auto e = q->memcpy(reinterpret_cast<void *>(host),
                        reinterpret_cast<void *>(device),
                        sycl::MaxNumTeams * sizeof(T));
- 
+
     e.wait();
   }
 
   //! frees all data from the offload information passed
   RAJA_INLINE void cleanup(Offload_Info &info)
   {
-    ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
+    ::sycl::queue *q = ::camp::resources::Sycl::get_default().get_queue();
 
     if (device) {
       ::sycl::free(reinterpret_cast<void *>(device), *q);
@@ -180,7 +172,7 @@ struct Reduce_Data
     }
     if (host) {
       ::sycl::free(reinterpret_cast<void *>(host), *q);
-      //delete[] host;
+      // delete[] host;
       host = nullptr;
     }
   }
@@ -191,8 +183,7 @@ struct Reduce_Data
 //! SYCL Target Reduction entity -- generalize on # of teams, reduction, and
 //! type
 template <typename Reducer, typename T>
-struct TargetReduce 
-{
+struct TargetReduce {
   TargetReduce() = delete;
   TargetReduce(const TargetReduce &) = default;
 
@@ -214,19 +205,17 @@ struct TargetReduce
   }
 
   //! apply reduction on device upon destruction
-  ~TargetReduce()
-  {
-  }
+  ~TargetReduce() {}
 
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
     if (!info.isMapped) {
       val.deviceToHost(info);
-      for (int i =0; i < sycl::MaxNumTeams; ++i) {
+      for (int i = 0; i < sycl::MaxNumTeams; ++i) {
         Reducer{}(val.value, val.host[i]);
       }
-//      val.cleanup(info);
+      //      val.cleanup(info);
       info.isMapped = true;
     }
     finalVal = Reducer::identity();
@@ -243,8 +232,12 @@ struct TargetReduce
   TargetReduce &reduce(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        val.device[i]);
     Reducer{}(atm, rhsVal);
     return *this;
 #else
@@ -257,9 +250,13 @@ struct TargetReduce
   const TargetReduce &reduce(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(val.device[i]);
-    Reducer{}(atm, rhsVal);  
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        val.device[i]);
+    Reducer{}(atm, rhsVal);
     return *this;
 #else
     Reducer{}(val.value, rhsVal);
@@ -281,13 +278,15 @@ struct TargetReduce
 //! SYCL Target Reduction Location entity -- generalize on # of teams,
 //! reduction, and type
 template <typename Reducer, typename T, typename IndexType>
-struct TargetReduceLoc 
-{
+struct TargetReduceLoc {
   TargetReduceLoc() = delete;
   TargetReduceLoc(const TargetReduceLoc &) = default;
-  explicit TargetReduceLoc(T init_val, IndexType init_loc,
-                           T identity_val_ = Reducer::identity,
-                           IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+  explicit TargetReduceLoc(
+      T init_val,
+      IndexType init_loc,
+      T identity_val_ = Reducer::identity,
+      IndexType identity_loc_ =
+          RAJA::reduce::detail::DefaultLoc<IndexType>().value())
       : info(),
         val(identity_val_, identity_val_, info),
         loc(identity_loc_, identity_loc_, info),
@@ -298,9 +297,11 @@ struct TargetReduceLoc
   {
   }
 
-  void reset(T init_val_, IndexType init_loc_,
+  void reset(T init_val_,
+             IndexType init_loc_,
              T identity_val_ = Reducer::identity,
-             IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_loc_ =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
     val.cleanup(info);
     val = sycl::Reduce_Data<T>(identity_val_, identity_val_, info);
@@ -314,9 +315,7 @@ struct TargetReduceLoc
   }
 
   //! apply reduction on device upon destruction
-  ~TargetReduceLoc()
-  {
-  }
+  ~TargetReduceLoc() {}
 
   //! map result value back to host if not done already; return aggregate value
   operator T()
@@ -324,7 +323,7 @@ struct TargetReduceLoc
     if (!info.isMapped) {
       val.deviceToHost(info);
       loc.deviceToHost(info);
-      
+
       for (int i = 0; i < sycl::MaxNumTeams; ++i) {
         Reducer{}(val.value, loc.value, val.host[i], loc.host[i]);
       }
@@ -356,10 +355,12 @@ struct TargetReduceLoc
   TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    ::sycl::atomic_fence(::sycl::memory_order_acquire, ::sycl::memory_scope::device);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    ::sycl::atomic_fence(::sycl::memory_order_acquire,
+                         ::sycl::memory_scope::device);
     Reducer{}(val.device[i], loc.device[i], rhsVal, rhsLoc);
-    ::sycl::atomic_fence(::sycl::memory_order_release, ::sycl::memory_scope::device);
+    ::sycl::atomic_fence(::sycl::memory_order_release,
+                         ::sycl::memory_scope::device);
     return *this;
 #else
     Reducer{}(val.value, loc.value, rhsVal, rhsLoc);
@@ -382,7 +383,7 @@ struct TargetReduceLoc
   //! storage for offload information
   sycl::Offload_Info info;
   //! storage for reduction data for value
-//  sycl::Reduce_Data<T> val;
+  //  sycl::Reduce_Data<T> val;
   //! storage for redcution data for location
   T initVal;
   T finalVal;
@@ -395,11 +396,9 @@ struct TargetReduceLoc
 
 //! specialization of ReduceSum for omp_target_reduce
 template <typename T>
-class ReduceSum<sycl_reduce, T>
-    : public TargetReduce<RAJA::reduce::sum<T>, T>
+class ReduceSum<sycl_reduce, T> : public TargetReduce<RAJA::reduce::sum<T>, T>
 {
 public:
-
   using self = ReduceSum<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::sum<T>, T>;
   using parent::parent;
@@ -415,8 +414,12 @@ class ReduceSum<sycl_reduce, T>
   const self &operator+=(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        parent::val.device[i]);
     atm.fetch_add(rhsVal);
     return *this;
 #else
@@ -432,7 +435,6 @@ class ReduceBitOr<sycl_reduce, T>
     : public TargetReduce<RAJA::reduce::or_bit<T>, T>
 {
 public:
-
   using self = ReduceBitOr<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::or_bit<T>, T>;
   using parent::parent;
@@ -441,8 +443,12 @@ class ReduceBitOr<sycl_reduce, T>
   self &operator|=(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        parent::val.device[i]);
     atm |= rhsVal;
     return *this;
 #else
@@ -455,8 +461,12 @@ class ReduceBitOr<sycl_reduce, T>
   const self &operator|=(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        parent::val.device[i]);
     atm |= rhsVal;
     return *this;
 #else
@@ -472,7 +482,6 @@ class ReduceBitAnd<sycl_reduce, T>
     : public TargetReduce<RAJA::reduce::and_bit<T>, T>
 {
 public:
-
   using self = ReduceBitAnd<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::and_bit<T>, T>;
   using parent::parent;
@@ -481,8 +490,12 @@ class ReduceBitAnd<sycl_reduce, T>
   self &operator&=(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        parent::val.device[i]);
     atm &= rhsVal;
     return *this;
 #else
@@ -495,8 +508,12 @@ class ReduceBitAnd<sycl_reduce, T>
   const self &operator&=(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        parent::val.device[i]);
     atm &= rhsVal;
     return *this;
 #else
@@ -509,11 +526,9 @@ class ReduceBitAnd<sycl_reduce, T>
 
 //! specialization of ReduceMin for omp_target_reduce
 template <typename T>
-class ReduceMin<sycl_reduce, T>
-    : public TargetReduce<RAJA::reduce::min<T>, T>
+class ReduceMin<sycl_reduce, T> : public TargetReduce<RAJA::reduce::min<T>, T>
 {
 public:
-
   using self = ReduceMin<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::min<T>, T>;
   using parent::parent;
@@ -522,8 +537,12 @@ class ReduceMin<sycl_reduce, T>
   self &min(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        parent::val.device[i]);
     atm.fetch_min(rhsVal);
     return *this;
 #else
@@ -536,8 +555,12 @@ class ReduceMin<sycl_reduce, T>
   const self &min(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        parent::val.device[i]);
     atm.fetch_min(rhsVal);
     return *this;
 #else
@@ -550,11 +573,9 @@ class ReduceMin<sycl_reduce, T>
 
 //! specialization of ReduceMax for omp_target_reduce
 template <typename T>
-class ReduceMax<sycl_reduce, T>
-    : public TargetReduce<RAJA::reduce::max<T>, T>
+class ReduceMax<sycl_reduce, T> : public TargetReduce<RAJA::reduce::max<T>, T>
 {
 public:
-
   using self = ReduceMax<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::max<T>, T>;
   using parent::parent;
@@ -563,8 +584,12 @@ class ReduceMax<sycl_reduce, T>
   self &max(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        parent::val.device[i]);
     atm.fetch_max(rhsVal);
     return *this;
 #else
@@ -577,8 +602,12 @@ class ReduceMax<sycl_reduce, T>
   const self &max(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, ::sycl::memory_order_acq_rel, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0];
+    auto atm = ::sycl::atomic_ref<T,
+                                  ::sycl::memory_order_acq_rel,
+                                  ::sycl::memory_scope::device,
+                                  ::sycl::access::address_space::global_space>(
+        parent::val.device[i]);
     atm.fetch_max(rhsVal);
     return *this;
 #else
diff --git a/include/RAJA/policy/tensor/arch.hpp b/include/RAJA/policy/tensor/arch.hpp
index 771adea64f..f7fd687c8a 100644
--- a/include/RAJA/policy/tensor/arch.hpp
+++ b/include/RAJA/policy/tensor/arch.hpp
@@ -23,26 +23,27 @@
 namespace RAJA
 {
 
-namespace internal {
+namespace internal
+{
 
 namespace expt
 {
 
 
-  /*!
-   * Provides architectural details for a given architecture and data type.
-   */
-  template<typename REGISTER_POLICY, typename T>
-  struct RegisterTraits;
-  /*
-   * using element_type = T;
-   * using register_policy = REGISTER_POLICY;
-   * static constexpr camp::idx s_num_bits = X;
-   * static constexpr camp::idx s_num_elem = Y;
-   *
-   */
-} //namespace expt
-} //namespace internal
+/*!
+ * Provides architectural details for a given architecture and data type.
+ */
+template <typename REGISTER_POLICY, typename T>
+struct RegisterTraits;
+/*
+ * using element_type = T;
+ * using register_policy = REGISTER_POLICY;
+ * static constexpr camp::idx s_num_bits = X;
+ * static constexpr camp::idx s_num_elem = Y;
+ *
+ */
+}  // namespace expt
+}  // namespace internal
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -54,7 +55,8 @@ namespace expt
 {
 
 #ifdef __AVX512F__
-struct avx512_register {};
+struct avx512_register {
+};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::avx512_register
@@ -63,7 +65,8 @@ struct avx512_register {};
 
 
 #ifdef __AVX2__
-struct avx2_register {};
+struct avx2_register {
+};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::avx2_register
@@ -72,7 +75,8 @@ struct avx2_register {};
 
 
 #ifdef __AVX__
-struct avx_register {};
+struct avx_register {
+};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::avx_register
@@ -85,7 +89,8 @@ struct avx_register {};
 /*!
  * A CUDA warp distributed vector register
  */
-struct cuda_warp_register {};
+struct cuda_warp_register {
+};
 
 #endif
 
@@ -96,12 +101,14 @@ struct cuda_warp_register {};
  * A HIP wavefront distributed vector register
  * On AMD GPUs this is rally just a vector register
  */
-struct hip_wave_register {};
+struct hip_wave_register {
+};
 
 #endif
 
 // The scalar register is always supported (doesn't require any SIMD/SIMT)
-struct scalar_register {};
+struct scalar_register {
+};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::scalar_register
@@ -109,13 +116,12 @@ struct scalar_register {};
 #endif
 
 
-  // This sets the default SIMD register that will be used
-  using default_register = RAJA_TENSOR_REGISTER_TYPE;
-
+// This sets the default SIMD register that will be used
+using default_register = RAJA_TENSOR_REGISTER_TYPE;
 
-} // namespace expt
-} // namespace RAJA
 
+}  // namespace expt
+}  // namespace RAJA
 
 
 //
diff --git a/include/RAJA/policy/tensor/arch/avx.hpp b/include/RAJA/policy/tensor/arch/avx.hpp
index ed25f1f3e3..004efbea94 100644
--- a/include/RAJA/policy/tensor/arch/avx.hpp
+++ b/include/RAJA/policy/tensor/arch/avx.hpp
@@ -17,11 +17,11 @@
 
 #ifdef __AVX__
 
-#include<RAJA/policy/tensor/arch/avx/traits.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_int64.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_int32.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_float.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_double.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_double.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_float.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_int32.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_int64.hpp>
+#include <RAJA/policy/tensor/arch/avx/traits.hpp>
 
 
-#endif // __AVX__
+#endif  // __AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_double.hpp b/include/RAJA/policy/tensor/arch/avx/avx_double.hpp
index 8a23d66e57..46eebf1b89 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_double.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_double.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx_double_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -34,444 +35,450 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<double, avx_register> :
-    public internal::expt::RegisterBase<Register<double, avx_register>>
+template <>
+class Register<double, avx_register>
+    : public internal::expt::RegisterBase<Register<double, avx_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<double, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type = Register<double, avx_register>;
+  using element_type = double;
+  using register_type = __m256d;
+
+  using int_vector_type = Register<int64_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0,
+                             N >= 3 ? -1 : 0,
+                             N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_pd()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : base_type(), m_value(_mm256_set_pd(x3, x2, x1, x0))
+  {
+  }
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    m_value = _mm256_loadu_pd(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_pd(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 4; ++i) {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<double, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<double, avx_register>;
-      using element_type = double;
-      using register_type = __m256d;
-
-      using int_vector_type = Register<int64_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm256_setzero_pd()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-                     base_type(), m_value(_mm256_set_pd(x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {}
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_pd(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_pd(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 4;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_pd();
-        for(camp::idx_t i = 0;i < N;++ i){
-          m_value[i] = ptr[i*stride];
-        };
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_pd(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_pd(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_pd(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply a masked divide, so do it manually
-        return self_type(_mm256_set_pd(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        auto sh1 = _mm256_permute_pd(m_value, 0x5);
-        auto red1 = _mm256_add_pd(m_value, sh1);
-        return red1[0]+red1[2];
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // permute the first two and last two lanes of the register
-        // A = { v[1], v[0], v[3], v[2] }
-        register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-        // take the maximum value of each lane
-        // B = { max{v[0], v[1]},
-        //       max{v[0], v[1]},
-        //       max{v[2], v[3]},
-        //       max{v[2], v[3]} }
-        register_type b = _mm256_max_pd(m_value, a);
-
-        // now take the maximum of a lower and upper halves
-        return RAJA::max<element_type>(b[0], b[2]);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[3]},
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper halves
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[2]},   <-- just v[2]
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper lane
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return RAJA::max<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::min();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_pd(m_value, a.m_value));
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // permute the first two and last two lanes of the register
-        // A = { v[1], v[0], v[3], v[2] }
-        register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-        // take the minimum value of each lane
-        // B = { min{v[0], v[1]},
-        //       min{v[0], v[1]},
-        //       min{v[2], v[3]},
-        //       min{v[2], v[3]} }
-        register_type b = _mm256_min_pd(m_value, a);
-
-        // now take the minimum of a lower and upper halves
-        return RAJA::min<element_type>(b[0], b[2]);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[3]},
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper halves
-          return RAJA::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[2]},   <-- just v[2]
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper lane
-          return RAJA::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return RAJA::min<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::max();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_pd(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+    m_value = _mm256_setzero_pd();
+    for (camp::idx_t i = 0; i < N; ++i) {
+      m_value[i] = ptr[i * stride];
+    };
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    _mm256_storeu_pd(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_pd(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 4; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
+  {
+    m_value = _mm256_set1_pd(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm256_add_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm256_sub_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(_mm256_mul_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    return self_type(_mm256_div_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    // AVX2 does not supply a masked divide, so do it manually
+    return self_type(_mm256_set_pd(N >= 4 ? get(3) / b.get(3) : 0,
+                                   N >= 3 ? get(2) / b.get(2) : 0,
+                                   N >= 2 ? get(1) / b.get(1) : 0,
+                                   N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    auto sh1 = _mm256_permute_pd(m_value, 0x5);
+    auto red1 = _mm256_add_pd(m_value, sh1);
+    return red1[0] + red1[2];
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // permute the first two and last two lanes of the register
+    // A = { v[1], v[0], v[3], v[2] }
+    register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+    // take the maximum value of each lane
+    // B = { max{v[0], v[1]},
+    //       max{v[0], v[1]},
+    //       max{v[2], v[3]},
+    //       max{v[2], v[3]} }
+    register_type b = _mm256_max_pd(m_value, a);
+
+    // now take the maximum of a lower and upper halves
+    return RAJA::max<element_type>(b[0], b[2]);
+  }
+
+  /*!
+   * @brief Returns the largest element of the first N lanes
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    if (N == 4) {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[0], v[1]},
+      //       max{v[2], v[3]},
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper halves
+      return RAJA::max<element_type>(b[0], b[2]);
+    } else if (N == 3) {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[0], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[0], v[1]},
+      //       max{v[2], v[2]},   <-- just v[2]
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper lane
+      return RAJA::max<element_type>(b[0], b[2]);
+    } else if (N == 2) {
+      return RAJA::max<element_type>(m_value[0], m_value[1]);
+    } else if (N == 1) {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::min();
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_pd(m_value, a.m_value));
+  }
+
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // permute the first two and last two lanes of the register
+    // A = { v[1], v[0], v[3], v[2] }
+    register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+    // take the minimum value of each lane
+    // B = { min{v[0], v[1]},
+    //       min{v[0], v[1]},
+    //       min{v[2], v[3]},
+    //       min{v[2], v[3]} }
+    register_type b = _mm256_min_pd(m_value, a);
+
+    // now take the minimum of a lower and upper halves
+    return RAJA::min<element_type>(b[0], b[2]);
+  }
+
+
+  /*!
+   * @brief Returns the smallest element of the first N lanes
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N == 4) {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[0], v[1]},
+      //       min{v[2], v[3]},
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper halves
+      return RAJA::min<element_type>(b[0], b[2]);
+    } else if (N == 3) {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[0], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[0], v[1]},
+      //       min{v[2], v[2]},   <-- just v[2]
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper lane
+      return RAJA::min<element_type>(b[0], b[2]);
+    } else if (N == 2) {
+      return RAJA::min<element_type>(m_value[0], m_value[1]);
+    } else if (N == 1) {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::max();
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_pd(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX__
+#endif  //__AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_float.hpp b/include/RAJA/policy/tensor/arch/avx/avx_float.hpp
index 1e6563742a..131fdac1df 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_float.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_float.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx_float_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -34,457 +35,472 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<float, avx_register> :
-    public internal::expt::RegisterBase<Register<float, avx_register>>
+template <>
+class Register<float, avx_register>
+    : public internal::expt::RegisterBase<Register<float, avx_register>>
+{
+public:
+  using base_type = internal::expt::RegisterBase<Register<float, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type = Register<float, avx_register>;
+  using element_type = float;
+  using register_type = __m256;
+
+  using int_vector_type = Register<int32_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi32(N >= 8 ? -1 : 0,
+                            N >= 7 ? -1 : 0,
+                            N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0,
+                            N >= 4 ? -1 : 0,
+                            N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0,
+                            N >= 1 ? -1 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_ps()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_ps(x7, x6, x5, x4, x3, x2, x1, x0))
+  {
+  }
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const &c) : m_value(_mm256_set1_ps(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    m_value = _mm256_loadu_ps(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_ps(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 8; ++i) {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_setzero_ps();
+    for (camp::idx_t i = 0; i < N; ++i) {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    _mm256_storeu_ps(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_ps(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<float, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<float, avx_register>;
-      using element_type = float;
-      using register_type = __m256;
-
-      using int_vector_type = Register<int32_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm256_setzero_ps()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_ps(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_ps(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_ps(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_ps(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 8;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_ps();
-        for(camp::idx_t i = 0;i < N;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_ps(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_ps(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_ps(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N ) const {
-        // AVX2 does not supply a masked divide
-        return self_type(_mm256_set_ps(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_add_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_add_ps(red1, sh2);
-
-        return red2[0] + red2[4];
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // swap odd-even pairs and combine
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        // swap odd-even quads and combine
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        // combine quads
-        return RAJA::max<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns the largest element of first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::min();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return RAJA::max<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-
-        if(N == 7){
-          // blend out the 8th lane of the permute
-          sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
-        }
-
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        // Some more simple shortcuts
-        if(N == 3){
-          return RAJA::max<element_type>(red1[0], m_value[2]);
-        }
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        if(N == 4){
-          return red2[0];
-        }
-        if(N == 5){
-          return RAJA::max<element_type>(red2[0], m_value[4]);
-        }
-        if(N == 6){
-          return RAJA::max<element_type>(red2[0], red1[4]);
-        }
-
-        // 7 or 8 lanes
-        return RAJA::max<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_ps(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // swap odd-even pairs and combine
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        // swap odd-even quads and combine
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        // combine quads
-        return RAJA::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::max();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return RAJA::min<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-
-        if(N == 7){
-          // blend out the 8th lane of the permute
-          sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
-        }
-
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        // Some more simple shortcuts
-        if(N == 3){
-          return RAJA::min<element_type>(red1[0], m_value[2]);
-        }
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        if(N == 4){
-          return red2[0];
-        }
-        if(N == 5){
-          return RAJA::min<element_type>(red2[0], m_value[4]);
-        }
-        if(N == 6){
-          return RAJA::min<element_type>(red2[0], red1[4]);
-        }
-
-        // 7 or 8 lanes
-        return RAJA::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_ps(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+    m_value = _mm256_set1_ps(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm256_add_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm256_sub_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(_mm256_mul_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    return self_type(_mm256_div_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    // AVX2 does not supply a masked divide
+    return self_type(_mm256_set_ps(N >= 8 ? get(7) / b.get(7) : 0,
+                                   N >= 7 ? get(6) / b.get(6) : 0,
+                                   N >= 6 ? get(5) / b.get(5) : 0,
+                                   N >= 5 ? get(4) / b.get(4) : 0,
+                                   N >= 4 ? get(3) / b.get(3) : 0,
+                                   N >= 3 ? get(2) / b.get(2) : 0,
+                                   N >= 2 ? get(1) / b.get(1) : 0,
+                                   N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap odd-even pairs and add
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_add_ps(m_value, sh1);
+
+    // swap odd-even quads and add
+    auto sh2 = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_add_ps(red1, sh2);
+
+    return red2[0] + red2[4];
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // swap odd-even pairs and combine
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    // swap odd-even quads and combine
+    auto sh2 = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    // combine quads
+    return RAJA::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the largest element of first N lanes
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8) {
+      return RAJA::operators::limits<float>::min();
+    }
+    if (N == 1) {
+      return m_value[0];
+    }
+    if (N == 2) {
+      return RAJA::max<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and combine
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+
+    if (N == 7) {
+      // blend out the 8th lane of the permute
+      sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
+    }
+
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    // Some more simple shortcuts
+    if (N == 3) {
+      return RAJA::max<element_type>(red1[0], m_value[2]);
+    }
+
+
+    // swap odd-even quads and combine
+    auto sh2 = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    if (N == 4) {
+      return red2[0];
+    }
+    if (N == 5) {
+      return RAJA::max<element_type>(red2[0], m_value[4]);
+    }
+    if (N == 6) {
+      return RAJA::max<element_type>(red2[0], red1[4]);
+    }
+
+    // 7 or 8 lanes
+    return RAJA::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_ps(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // swap odd-even pairs and combine
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    // swap odd-even quads and combine
+    auto sh2 = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    // combine quads
+    return RAJA::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the smallest element of first N lanes
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8) {
+      return RAJA::operators::limits<float>::max();
+    }
+    if (N == 1) {
+      return m_value[0];
+    }
+    if (N == 2) {
+      return RAJA::min<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and combine
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+
+    if (N == 7) {
+      // blend out the 8th lane of the permute
+      sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
+    }
+
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    // Some more simple shortcuts
+    if (N == 3) {
+      return RAJA::min<element_type>(red1[0], m_value[2]);
+    }
+
+
+    // swap odd-even quads and combine
+    auto sh2 = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    if (N == 4) {
+      return red2[0];
+    }
+    if (N == 5) {
+      return RAJA::min<element_type>(red2[0], m_value[4]);
+    }
+    if (N == 6) {
+      return RAJA::min<element_type>(red2[0], red1[4]);
+    }
+
+    // 7 or 8 lanes
+    return RAJA::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_ps(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX__
+#endif  //__AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp b/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp
index 11ab97be16..94725afd9e 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx_int32_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -33,738 +34,774 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int32_t, avx_register> :
-    public internal::expt::RegisterBase<Register<int32_t, avx_register>>
+template <>
+class Register<int32_t, avx_register>
+    : public internal::expt::RegisterBase<Register<int32_t, avx_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int32_t, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type = Register<int32_t, avx_register>;
+  using element_type = int32_t;
+  using register_type = __m256i;
+
+  using int_vector_type = Register<int32_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi32(N >= 8 ? -1 : 0,
+                            N >= 7 ? -1 : 0,
+                            N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0,
+                            N >= 4 ? -1 : 0,
+                            N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0,
+                            N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi32(7 * stride,
+                            6 * stride,
+                            5 * stride,
+                            4 * stride,
+                            3 * stride,
+                            2 * stride,
+                            stride,
+                            0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute1(camp::idx_t N) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int32_t, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<int32_t, avx_register>;
-      using element_type = int32_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int32_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi32(
-            7*stride, 6*stride, 5*stride, 4*stride,
-            3*stride, 2*stride, stride, 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute1(camp::idx_t N) const {
-        // Generate a permutation for first round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 7 ? 6 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 6 ? 5 : 0,
-            N >= 3 ? 2 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 2 ? 1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute2(camp::idx_t N) const {
-        // Generate a permutation for second round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 6 ? 5 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 7 ? 6 : 0,
-            N >= 2 ? 1 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 2 ? 2 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_epi32(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi32(c)) {}
-
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256((__m256i const *)ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_setzero_si256();
-        for(camp::idx_t i = 0;i < N;++ i){
-          set(ptr[i], i);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 8;++ i){
-          set(ptr[i*stride], i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_si256();
-        for(camp::idx_t i = 0;i < N;++ i){
-          set(ptr[i*stride], i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_ps(reinterpret_cast<float*>(ptr), createMask(N), reinterpret_cast<__m256>(m_value));
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi32(m_value, 0);
-          case 1: return _mm256_extract_epi32(m_value, 1);
-          case 2: return _mm256_extract_epi32(m_value, 2);
-          case 3: return _mm256_extract_epi32(m_value, 3);
-          case 4: return _mm256_extract_epi32(m_value, 4);
-          case 5: return _mm256_extract_epi32(m_value, 5);
-          case 6: return _mm256_extract_epi32(m_value, 6);
-          case 7: return _mm256_extract_epi32(m_value, 7);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi32(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi32(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi32(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi32(m_value, value, 3); break;
-          case 4: m_value = _mm256_insert_epi32(m_value, value, 4); break;
-          case 5: m_value = _mm256_insert_epi32(m_value, value, 5); break;
-          case 6: m_value = _mm256_insert_epi32(m_value, value, 6); break;
-          case 7: m_value = _mm256_insert_epi32(m_value, value, 7); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi32(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        // no 8-way 32-bit add, but there is a 4-way... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_add_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_add_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        // no 8-way 32-bit subtract, but there is a 4-way... split and conquer
-
-        // Low 128-bits
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_sub_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_sub_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        // no 8-way 32-bit multiply, but there is a 32x32 -> 64
-        // This gets ugly :)
-
-        // Low 128-bits
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        // multiply even lanes 0, 2
-        auto res_low_even = _mm_mul_epi32(low_a, low_b);
-
-        // multiply odd lanes 1, 3
-        auto low_a_sh = _mm_shuffle_epi32(low_a, 0xB1);
-        auto low_b_sh = _mm_shuffle_epi32(low_b, 0xB1);
-        auto res_low_odd = _mm_mul_epi32(low_a_sh, low_b_sh);
-
-        // recombine to get all 4 lanes
-        // note: AVX doesn't have a int32 blend, so we use the float32 blend
-        res_low_odd = _mm_shuffle_epi32(res_low_odd, 0xB1);
-        auto res_low = _mm256_castsi128_si256(_mm_castps_si128(
-            _mm_blend_ps(_mm_castsi128_ps(res_low_odd),
-                         _mm_castsi128_ps(res_low_even),
-                         0x05)
-            ));
-
-
-        // High 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        // multiply even lanes 0, 2
-        auto res_hi_even = _mm_mul_epi32(hi_a, hi_b);
-
-        // multiply odd lanes 1, 3
-        auto hi_a_sh = _mm_shuffle_epi32(hi_a, 0xB1);
-        auto hi_b_sh = _mm_shuffle_epi32(hi_b, 0xB1);
-        auto res_hi_odd = _mm_mul_epi32(hi_a_sh, hi_b_sh);
-
-        // recombine to get all 4 lanes
-        // note: AVX doesn't have a int32 blend, so we use the float32 blend
-        res_hi_odd = _mm_shuffle_epi32(res_hi_odd, 0xB1);
-        auto res_hi = _mm_castps_si128(
-            _mm_blend_ps(_mm_castsi128_ps(res_hi_odd),
-                         _mm_castsi128_ps(res_hi_even),
-                         0x05)
-            );
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_add_epi32(low, low_sh1);
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-        auto low_red2 = _mm_add_epi32(low_red1, low_sh2);
-
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_add_epi32(hi, hi_sh1);
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_add_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract total sum
-        auto hi_low = _mm_add_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-
-
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_max_epi32(low, low_sh1);
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
-
-
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract final reduction
-        auto hi_low = _mm_max_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::min();
-        }
-
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-        if(N==1){
-          return _mm256_extract_epi32(m_value, 0);
-        }
+    // Generate a permutation for first round of min/max routines
+    return _mm256_set_epi32(N >= 7 ? 6 : 0,
+                            N >= 8 ? 7 : 0,
+                            N >= 5 ? 4 : 0,
+                            N >= 6 ? 5 : 0,
+                            N >= 3 ? 2 : 0,
+                            N >= 4 ? 3 : 0,
+                            N >= 1 ? 0 : 0,
+                            N >= 2 ? 1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute2(camp::idx_t N) const
+  {
+    // Second-round min/max permutation: swap lane pairs, sources >= N map to 0
+    return _mm256_set_epi32(N >= 6 ? 5 : 0,
+                            N >= 5 ? 4 : 0,
+                            N >= 8 ? 7 : 0,
+                            N >= 7 ? 6 : 0,
+                            N >= 2 ? 1 : 0,
+                            N >= 1 ? 0 : 0,
+                            N >= 4 ? 3 : 0,
+                            N >= 3 ? 2 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0))
+  {
+  }
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
 
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
 
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_max_epi32(low, low_sh1);
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const &c) : m_value(_mm256_set1_epi32(c)) {}
 
-        if(N==2){
-          return _mm_extract_epi32(low_red1, 0);
-        }
 
-        if(N==3){
-          // get lane 2 into lane 0
-          auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
-          auto low_red1a = _mm_max_epi32(low_red1, low_sh1a);
-          return _mm_extract_epi32(low_red1a, 0);
-        }
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    m_value = _mm256_loadu_si256((__m256i const *)ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    m_value = _mm256_setzero_si256();
+    for (camp::idx_t i = 0; i < N; ++i) {
+      set(ptr[i], i);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 8; ++i) {
+      set(ptr[i * stride], i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_setzero_si256();
+    for (camp::idx_t i = 0; i < N; ++i) {
+      set(ptr[i * stride], i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N lanes to consecutive memory locations
+   *        (lanes >= N are masked out of the store by createMask(N))
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_ps(reinterpret_cast<float *>(ptr),
+                        createMask(N),
+                        _mm256_castsi256_ps(m_value));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i) {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i) {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i) {
+      case 0:
+        return _mm256_extract_epi32(m_value, 0);
+      case 1:
+        return _mm256_extract_epi32(m_value, 1);
+      case 2:
+        return _mm256_extract_epi32(m_value, 2);
+      case 3:
+        return _mm256_extract_epi32(m_value, 3);
+      case 4:
+        return _mm256_extract_epi32(m_value, 4);
+      case 5:
+        return _mm256_extract_epi32(m_value, 5);
+      case 6:
+        return _mm256_extract_epi32(m_value, 6);
+      case 7:
+        return _mm256_extract_epi32(m_value, 7);
+    }
+    return 0;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i) {
+      case 0:
+        m_value = _mm256_insert_epi32(m_value, value, 0);
+        break;
+      case 1:
+        m_value = _mm256_insert_epi32(m_value, value, 1);
+        break;
+      case 2:
+        m_value = _mm256_insert_epi32(m_value, value, 2);
+        break;
+      case 3:
+        m_value = _mm256_insert_epi32(m_value, value, 3);
+        break;
+      case 4:
+        m_value = _mm256_insert_epi32(m_value, value, 4);
+        break;
+      case 5:
+        m_value = _mm256_insert_epi32(m_value, value, 5);
+        break;
+      case 6:
+        m_value = _mm256_insert_epi32(m_value, value, 6);
+        break;
+      case 7:
+        m_value = _mm256_insert_epi32(m_value, value, 7);
+        break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
+  {
+    m_value = _mm256_set1_epi32(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    // no 8-way 32-bit add, but there is a 4-way... split and conquer
+
+    // Low 128-bits
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_add_epi32(low_a, low_b));
 
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+    // Hi 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_add_epi32(hi_a, hi_b);
 
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    // no 8-way 32-bit subtract, but there is a 4-way... split and conquer
+
+    // Low 128-bits
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_sub_epi32(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_sub_epi32(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    // no 8-way 32-bit multiply, but there is a 32x32 -> 64
+    // This gets ugly :)
+
+    // Low 128-bits
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(b.m_value);
+    // multiply even lanes 0, 2
+    auto res_low_even = _mm_mul_epi32(low_a, low_b);
+
+    // multiply odd lanes 1, 3
+    auto low_a_sh = _mm_shuffle_epi32(low_a, 0xB1);
+    auto low_b_sh = _mm_shuffle_epi32(low_b, 0xB1);
+    auto res_low_odd = _mm_mul_epi32(low_a_sh, low_b_sh);
+
+    // recombine to get all 4 lanes
+    // note: AVX doesn't have an int32 blend, so we use the float32 blend
+    res_low_odd = _mm_shuffle_epi32(res_low_odd, 0xB1);
+    auto res_low = _mm256_castsi128_si256(_mm_castps_si128(_mm_blend_ps(
+        _mm_castsi128_ps(res_low_odd), _mm_castsi128_ps(res_low_even), 0x05)));
+
+
+    // High 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
+    // multiply even lanes 0, 2
+    auto res_hi_even = _mm_mul_epi32(hi_a, hi_b);
+
+    // multiply odd lanes 1, 3
+    auto hi_a_sh = _mm_shuffle_epi32(hi_a, 0xB1);
+    auto hi_b_sh = _mm_shuffle_epi32(hi_b, 0xB1);
+    auto res_hi_odd = _mm_mul_epi32(hi_a_sh, hi_b_sh);
+
+    // recombine to get all 4 lanes
+    // note: AVX doesn't have an int32 blend, so we use the float32 blend
+    res_hi_odd = _mm_shuffle_epi32(res_hi_odd, 0xB1);
+    auto res_hi = _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(res_hi_odd),
+                                                _mm_castsi128_ps(res_hi_even),
+                                                0x05));
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi32(get(7) / b.get(7),
+                                      get(6) / b.get(6),
+                                      get(5) / b.get(5),
+                                      get(4) / b.get(4),
+                                      get(3) / b.get(3),
+                                      get(2) / b.get(2),
+                                      get(1) / b.get(1),
+                                      get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi32(N >= 8 ? get(7) / b.get(7) : 0,
+                                      N >= 7 ? get(6) / b.get(6) : 0,
+                                      N >= 6 ? get(5) / b.get(5) : 0,
+                                      N >= 5 ? get(4) / b.get(4) : 0,
+                                      N >= 4 ? get(3) / b.get(3) : 0,
+                                      N >= 3 ? get(2) / b.get(2) : 0,
+                                      N >= 2 ? get(1) / b.get(1) : 0,
+                                      N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
 
-        if(N==4){
-          return _mm_extract_epi32(low_red2, 0);
-        }
+    auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_add_epi32(low, low_sh1);
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+    auto low_red2 = _mm_add_epi32(low_red1, low_sh2);
+
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_add_epi32(hi, hi_sh1);
+
+    auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_add_epi32(hi_red1, hi_sh2);
+
+
+    // Sum halves, extract total sum
+    auto hi_low = _mm_add_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // Reduce each 128-bit half with two shuffle/max steps, which leaves
+    // that half's maximum in lane 0, then take the max of the two halves.
+    // All 8 lanes participate, so no per-lane special cases are needed.
+
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_max_epi32(low, low_sh1);
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
+
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+
+    auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
+
+    auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
+
+
+    // Combine halves, extract final reduction
+    auto hi_low = _mm_max_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return Largest of lanes 0..N-1; int32 minimum if N <= 0 or N > 8
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8) {
+      return RAJA::operators::limits<int32_t>::min();
+    }
+
+    // this is just painful, since we don't have a proper masked permute
+    // in AVX.  Lots of special cases to make sure we compare just the
+    // right lanes
+    if (N == 1) {
+      return _mm256_extract_epi32(m_value, 0);
+    }
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_max_epi32(low, low_sh1);
+
+    if (N == 2) {
+      return _mm_extract_epi32(low_red1, 0);
+    }
+
+    if (N == 3) {
+      // get lane 2 into lane 0
+      auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
+      auto low_red1a = _mm_max_epi32(low_red1, low_sh1a);
+      return _mm_extract_epi32(low_red1a, 0);
+    }
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
+
+    if (N == 4) {
+      return _mm_extract_epi32(low_red2, 0);
+    }
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    if (N == 5) {
+      auto red_5 = _mm_max_epi32(low_red2, hi);
+      return _mm_extract_epi32(red_5, 0);
+    }
+
+    auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
+
+    if (N == 6) {
+      auto red_6 = _mm_max_epi32(low_red2, hi_red1);
+      return _mm_extract_epi32(red_6, 0);
+    }
+    if (N == 7) {
+      // get lane 6 (lane 2 of hi) into lane 0
+      auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
+      auto hi_red_6 = _mm_max_epi32(hi_sh7, hi_red1);
+      auto red_7 = _mm_max_epi32(low_red2, hi_red_6);
+      return _mm_extract_epi32(red_7, 0);
+    }
+
+    auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
+
+
+    // Combine halves, extract final reduction
+    auto hi_low = _mm_max_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type b) const
+  {
+    // no 8-way 32-bit max, but there is a 4-way... split and conquer
+
+    // Low 128-bits
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_max_epi32(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_max_epi32(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // Reduce each 128-bit half with two shuffle/min steps, which leaves
+    // that half's minimum in lane 0, then take the min of the two halves.
+    // All 8 lanes participate, so no per-lane special cases are needed.
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_min_epi32(low, low_sh1);
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
+
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
+
+
+    auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
+
+
+    // Combine halves, extract final reduction
+    auto hi_low = _mm_min_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return Smallest of lanes 0..N-1; int32 maximum if N <= 0 or N > 8
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8) {
+      return RAJA::operators::limits<int32_t>::max();
+    }
+    // this is just painful, since we don't have a proper masked permute
+    // in AVX.  Lots of special cases to make sure we compare just the
+    // right lanes
+    if (N == 1) {
+      return _mm256_extract_epi32(m_value, 0);
+    }
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_min_epi32(low, low_sh1);
+
+    if (N == 2) {
+      return _mm_extract_epi32(low_red1, 0);
+    }
+
+    if (N == 3) {
+      // get lane 2 into lane 0
+      auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
+      auto low_red1a = _mm_min_epi32(low_red1, low_sh1a);
+      return _mm_extract_epi32(low_red1a, 0);
+    }
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
+
+    if (N == 4) {
+      return _mm_extract_epi32(low_red2, 0);
+    }
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    if (N == 5) {
+      auto red_5 = _mm_min_epi32(low_red2, hi);
+      return _mm_extract_epi32(red_5, 0);
+    }
+
+    auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
+
+    if (N == 6) {
+      auto red_6 = _mm_min_epi32(low_red2, hi_red1);
+      return _mm_extract_epi32(red_6, 0);
+    }
+    if (N == 7) {
+      // get lane 6 (lane 2 of hi) into lane 0
+      auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
+      auto hi_red_6 = _mm_min_epi32(hi_sh7, hi_red1);
+      auto red_7 = _mm_min_epi32(low_red2, hi_red_6);
+      return _mm_extract_epi32(red_7, 0);
+    }
+
+    auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
+
+
+    // Combine halves, extract final reduction
+    auto hi_low = _mm_min_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type b) const
+  {
+    // no 8-way 32-bit min, but there is a 4-way... split and conquer
 
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
+    // Low 128-bits
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_min_epi32(low_a, low_b));
 
-        if(N==5){
-          auto red_5 = _mm_max_epi32(low_red2, hi);
-          return _mm_extract_epi32(red_5, 0);
-        }
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
-
-        if(N==6){
-          auto red_6 = _mm_max_epi32(low_red2, hi_red1);
-          return _mm_extract_epi32(red_6, 0);
-        }
-        if(N==7){
-          // get lane 6 (lane 2 of hi) into lane 0
-          auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
-          auto hi_red_6 = _mm_max_epi32(hi_sh7, hi_red1);
-          auto red_7 = _mm_max_epi32(low_red2, hi_red_6);
-          return _mm_extract_epi32(red_7, 0);
-        }
+    // Hi 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_min_epi32(hi_a, hi_b);
 
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract total sum
-        auto hi_low = _mm_max_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type b) const
-      {
-        // no 8-way 32-bit min, but there is a 4-way... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_max_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_max_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_min_epi32(low, low_sh1);
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
-
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
-
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract total sum
-        auto hi_low = _mm_min_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::max();
-        }
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-        if(N==1){
-          return _mm256_extract_epi32(m_value, 0);
-        }
-
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_min_epi32(low, low_sh1);
-
-        if(N==2){
-          return _mm_extract_epi32(low_red1, 0);
-        }
-
-        if(N==3){
-          // get lane 2 into lane 0
-          auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
-          auto low_red1a = _mm_min_epi32(low_red1, low_sh1a);
-          return _mm_extract_epi32(low_red1a, 0);
-        }
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
-
-        if(N==4){
-          return _mm_extract_epi32(low_red2, 0);
-        }
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-        if(N==5){
-          auto red_5 = _mm_min_epi32(low_red2, hi);
-          return _mm_extract_epi32(red_5, 0);
-        }
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
-
-        if(N==6){
-          auto red_6 = _mm_min_epi32(low_red2, hi_red1);
-          return _mm_extract_epi32(red_6, 0);
-        }
-        if(N==7){
-          // get lane 6 (lane 2 of hi) into lane 0
-          auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
-          auto hi_red_6 = _mm_min_epi32(hi_sh7, hi_red1);
-          auto red_7 = _mm_min_epi32(low_red2, hi_red_6);
-          return _mm_extract_epi32(red_7, 0);
-        }
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+};
 
 
-        // Sum halves, extract total sum
-        auto hi_low = _mm_min_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type b) const
-      {
-        // no 8-way 32-bit min, but there is a 4-way... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_min_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_min_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-  };
-
-
-}   // namespace expt
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp b/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp
index 1c7fae3dc7..9db1130ec6 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx_int64_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -33,506 +34,523 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int64_t, avx_register> :
-    public internal::expt::RegisterBase<Register<int64_t, avx_register>>
+template <>
+class Register<int64_t, avx_register>
+    : public internal::expt::RegisterBase<Register<int64_t, avx_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int64_t, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type = Register<int64_t, avx_register>;
+  using element_type = int64_t;
+  using register_type = __m256i;
+
+  using int_vector_type = Register<int64_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0,
+                             N >= 3 ? -1 : 0,
+                             N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+  /*
+   * Use the packed-double permute function because there isn't one
+   * specifically for int64
+   *
+   * Just adds a bunch of casting, should be same cost
+   */
+  template <int perm>
+  RAJA_INLINE __m256i permute(__m256i x) const
+  {
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(x), perm));
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : m_value(_mm256_set_epi64x(x3, x2, x1, x0))
+  {
+  }
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const &c) : m_value(_mm256_set1_epi64x(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    m_value = _mm256_castpd_si256(
+        _mm256_maskload_pd(reinterpret_cast<double const *>(ptr),
+                           createMask(N)));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 4; ++i) {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements; the register is zeroed first.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_setzero_si256();
+    for (camp::idx_t i = 0; i < N; ++i) {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *        (lanes at index N and above are masked out of the store).
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_pd(reinterpret_cast<double *>(ptr),
+                        createMask(N),
+                        _mm256_castsi256_pd(m_value));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 4; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i) {
+      case 0:
+        return _mm256_extract_epi64(m_value, 0);
+      case 1:
+        return _mm256_extract_epi64(m_value, 1);
+      case 2:
+        return _mm256_extract_epi64(m_value, 2);
+      case 3:
+        return _mm256_extract_epi64(m_value, 3);
+    }
+    return 0;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i) {
+      case 0:
+        m_value = _mm256_insert_epi64(m_value, value, 0);
+        break;
+      case 1:
+        m_value = _mm256_insert_epi64(m_value, value, 1);
+        break;
+      case 2:
+        m_value = _mm256_insert_epi64(m_value, value, 2);
+        break;
+      case 3:
+        m_value = _mm256_insert_epi64(m_value, value, 3);
+        break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
+  {
+    m_value = _mm256_set1_epi64x(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int64_t, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<int64_t, avx_register>;
-      using element_type = int64_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int64_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-      /*
-       * Use the packed-double permute function because there isn't one
-       * specifically for int64
-       *
-       * Just adds a bunch of casting, should be same cost
-       */
-      template<int perm>
-      RAJA_INLINE
-      __m256i permute(__m256i x) const {
-        return _mm256_castpd_si256(
-            _mm256_permute_pd(_mm256_castsi256_pd(x), perm));
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(),  m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-        m_value(_mm256_set_epi64x(x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi64x(c)) {}
-
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_castpd_si256(
-            _mm256_maskload_pd(reinterpret_cast<double const *>(ptr), createMask(N))
-        );
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 4;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_si256();
-        for(camp::idx_t i = 0;i < N;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_pd(reinterpret_cast<double*>(ptr), createMask(N), reinterpret_cast<__m256d>(m_value));
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi64(m_value, 0);
-          case 1: return _mm256_extract_epi64(m_value, 1);
-          case 2: return _mm256_extract_epi64(m_value, 2);
-          case 3: return _mm256_extract_epi64(m_value, 3);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi64(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi64(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi64(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi64(m_value, value, 3); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi64x(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        // no 4-way 64-bit add, but there is a 2-way SSE... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_add_epi64(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_add_epi64(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        // no 4-way 64-bit subtract, but there is a 2-way SSE... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_sub_epi64(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_sub_epi64(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        // AVX2 does not supply an int64_t multiply, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)*b.get(3),
-            get(2)*b.get(2),
-            get(1)*b.get(1),
-            get(0)*b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap pairs and add
-        auto sh1 = permute<0x5>(m_value);
-
-        // Add lower 128-bits
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(sh1);
-        auto res_low = _mm_add_epi64(low_a, low_b);
-
-        // Add upper 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(sh1, 1);
-        auto res_hi = _mm_add_epi64(hi_a, hi_b);
-
-        // Sum upper and lower
-        auto res = _mm_add_epi64(res_hi, res_low);
-
-        // add lower and upper
-        return _mm_extract_epi64(res, 0);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // AVX2 does not supply an 64bit integer max!
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red < v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red < v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red < v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::min();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red < v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red < v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red < v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) > a.get(3) ? get(3) : a.get(3),
-              get(2) > a.get(2) ? get(2) : a.get(2),
-              get(1) > a.get(1) ? get(1) : a.get(1),
-              get(0) > a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red > v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red > v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red > v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::max();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red > v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red > v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red > v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) < a.get(3) ? get(3) : a.get(3),
-              get(2) < a.get(2) ? get(2) : a.get(2),
-              get(1) < a.get(1) ? get(1) : a.get(1),
-              get(0) < a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-  };
-
-
-}   // namespace expt
+    // no 4-way 64-bit add, but there is a 2-way SSE... split and conquer
+
+    // Low 128-bits  - use _mm256_castsi256_si128???
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_add_epi64(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_add_epi64(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    // no 4-way 64-bit subtract, but there is a 2-way SSE... split and conquer
+
+    // Low 128-bits  - use _mm256_castsi256_si128???
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_sub_epi64(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_sub_epi64(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    // AVX2 does not supply an int64_t multiply, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) * b.get(3),
+                                       get(2) * b.get(2),
+                                       get(1) * b.get(1),
+                                       get(0) * b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) / b.get(3),
+                                       get(2) / b.get(2),
+                                       get(1) / b.get(1),
+                                       get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(N >= 4 ? get(3) / b.get(3) : 0,
+                                       N >= 3 ? get(2) / b.get(2) : 0,
+                                       N >= 2 ? get(1) / b.get(1) : 0,
+                                       N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap pairs and add
+    auto sh1 = permute<0x5>(m_value);
+
+    // Add lower 128-bits
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(sh1);
+    auto res_low = _mm_add_epi64(low_a, low_b);
+
+    // Add upper 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(sh1, 1);
+    auto res_hi = _mm_add_epi64(hi_a, hi_b);
+
+    // Sum upper and lower
+    auto res = _mm_add_epi64(res_hi, res_low);
+
+    // add lower and upper
+    return _mm_extract_epi64(res, 0);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // AVX2 does not supply an 64bit integer max!
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red = red < v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red = red < v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red = red < v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest scalar among the first N elements of the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4) {
+      return RAJA::operators::limits<int64_t>::min();
+    }
+
+    // AVX2 does not supply an 64bit integer max?!?
+    auto red = get(0);
+
+    if (N > 1) {
+      auto v1 = get(1);
+      red = red < v1 ? v1 : red;
+    }
+    if (N > 2) {
+      auto v2 = get(2);
+      red = red < v2 ? v2 : red;
+    }
+    if (N > 3) {
+      auto v3 = get(3);
+      red = red < v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) > a.get(3) ? get(3) : a.get(3),
+                                       get(2) > a.get(2) ? get(2) : a.get(2),
+                                       get(1) > a.get(1) ? get(1) : a.get(1),
+                                       get(0) > a.get(0) ? get(0) : a.get(0)));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+
+    // AVX2 does not supply a 64-bit integer min
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red = red > v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red = red > v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red = red > v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest scalar among the first N elements of the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4) {
+      return RAJA::operators::limits<int64_t>::max();
+    }
+
+    // AVX2 does not supply a 64-bit integer min
+    auto red = get(0);
+
+    if (N > 1) {
+      auto v1 = get(1);
+      red = red > v1 ? v1 : red;
+    }
+    if (N > 2) {
+      auto v2 = get(2);
+      red = red > v2 ? v2 : red;
+    }
+    if (N > 3) {
+      auto v3 = get(3);
+      red = red > v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) < a.get(3) ? get(3) : a.get(3),
+                                       get(2) < a.get(2) ? get(2) : a.get(2),
+                                       get(1) < a.get(1) ? get(1) : a.get(1),
+                                       get(0) < a.get(0) ? get(0) : a.get(0)));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx/traits.hpp b/include/RAJA/policy/tensor/arch/avx/traits.hpp
index 33c18e2c5f..51443bcc4f 100644
--- a/include/RAJA/policy/tensor/arch/avx/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/traits.hpp
@@ -20,52 +20,55 @@
 #ifndef RAJA_policy_tensor_arch_avx_traits_HPP
 #define RAJA_policy_tensor_arch_avx_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx_register, int32_t> {
+  using element_type = int32_t;
+  using register_policy = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx_register, int64_t> {
+  using element_type = int64_t;
+  using register_policy = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type = int64_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx_register, float> {
+  using element_type = float;
+  using register_policy = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx_register, double> {
+  using element_type = double;
+  using register_policy = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type = int64_t;
+};
 
-} // namespace intenral
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
 
 
-#endif // __AVX__
+#endif  // __AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx2.hpp b/include/RAJA/policy/tensor/arch/avx2.hpp
index b462257924..766aaa3b56 100644
--- a/include/RAJA/policy/tensor/arch/avx2.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2.hpp
@@ -17,11 +17,11 @@
 
 #ifdef __AVX2__
 
-#include<RAJA/policy/tensor/arch/avx2/traits.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_int32.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_int64.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_float.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_double.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_double.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_float.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_int32.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_int64.hpp>
+#include <RAJA/policy/tensor/arch/avx2/traits.hpp>
 
 
-#endif // __AVX2__
+#endif  // __AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp
index 852003a4f9..14aabacbde 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx2_double_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -34,529 +35,541 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<double, avx2_register> :
-    public internal::expt::RegisterBase<Register<double, avx2_register>>
+template <>
+class Register<double, avx2_register>
+    : public internal::expt::RegisterBase<Register<double, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<double, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type = Register<double, avx2_register>;
+  using element_type = double;
+  using register_type = __m256d;
+
+  using int_vector_type = Register<int64_t, avx2_register>;
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0,
+                             N >= 3 ? -1 : 0,
+                             N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_pd()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : m_value(_mm256_set_pd(x3, x2, x1, x0))
+  {
+  }
+
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {}
+
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<double, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<double, avx2_register>;
-      using element_type = double;
-      using register_type = __m256d;
-
-      using int_vector_type = Register<int64_t, avx2_register>;
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_pd()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-        m_value(_mm256_set_pd(x3,x2,x1,x0))
-      {}
-
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {}
-
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_packed ++;
+    RAJA::tensor_stats::num_vector_load_packed++;
 #endif
-        m_value = _mm256_loadu_pd(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
+    m_value = _mm256_loadu_pd(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_packed_n ++;
+    RAJA::tensor_stats::num_vector_load_packed_n++;
 #endif
-        m_value = _mm256_maskload_pd(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
+    m_value = _mm256_maskload_pd(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided ++;
+    RAJA::tensor_stats::num_vector_load_strided++;
 #endif
-        m_value = _mm256_i64gather_pd(ptr,
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
+    m_value = _mm256_i64gather_pd(ptr,
+                                  createStridedOffsets(stride),
+                                  sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_mask_i64gather_pd(_mm256_setzero_pd(),
-                                      ptr,
-                                      createStridedOffsets(stride),
-                                      _mm256_castsi256_pd(createMask(N)),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
+    m_value = _mm256_mask_i64gather_pd(_mm256_setzero_pd(),
+                                       ptr,
+                                       createStridedOffsets(stride),
+                                       _mm256_castsi256_pd(createMask(N)),
+                                       sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type &gather(element_type const *ptr, int_vector_type offsets)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_i64gather_pd(ptr,
-                                      offsets.get_register(),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
+    m_value =
+        _mm256_i64gather_pd(ptr, offsets.get_register(), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type &gather_n(element_type const *ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_mask_i64gather_pd(_mm256_setzero_pd(),
-                                      ptr,
-                                      offsets.get_register(),
-                                      _mm256_castsi256_pd(createMask(N)),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
+    m_value = _mm256_mask_i64gather_pd(_mm256_setzero_pd(),
+                                       ptr,
+                                       offsets.get_register(),
+                                       _mm256_castsi256_pd(createMask(N)),
+                                       sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_packed ++;
+    RAJA::tensor_stats::num_vector_store_packed++;
 #endif
-        _mm256_storeu_pd(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
+    _mm256_storeu_pd(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_packed_n ++;
+    RAJA::tensor_stats::num_vector_store_packed_n++;
 #endif
-        _mm256_maskstore_pd(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
+    _mm256_maskstore_pd(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_strided ++;
+    RAJA::tensor_stats::num_vector_store_strided++;
 #endif
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
+    for (camp::idx_t i = 0; i < 4; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_strided_n ++;
+    RAJA::tensor_stats::num_vector_store_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_pd(value);
-        return *this;
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        switch(i){
-          case 0: return self_type(_mm256_permute4x64_pd (m_value, 0x00));
-          case 1: return self_type(_mm256_permute4x64_pd (m_value, 0x55));
-          case 2: return self_type(_mm256_permute4x64_pd (m_value, 0xAA));
-          case 3: return self_type(_mm256_permute4x64_pd (m_value, 0xFF));
-        }
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply a masked divide, so do it manually
-        return self_type(_mm256_set_pd(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
+    for (camp::idx_t i = 0; i < N; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
+  {
+    m_value = _mm256_set1_pd(value);
+    return *this;
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    switch (i) {
+      case 0:
+        return self_type(_mm256_permute4x64_pd(m_value, 0x00));
+      case 1:
+        return self_type(_mm256_permute4x64_pd(m_value, 0x55));
+      case 2:
+        return self_type(_mm256_permute4x64_pd(m_value, 0xAA));
+      case 3:
+        return self_type(_mm256_permute4x64_pd(m_value, 0xFF));
+    }
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm256_add_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm256_sub_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(_mm256_mul_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    return self_type(_mm256_div_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    // AVX2 does not supply a masked divide, so do it manually
+    return self_type(_mm256_set_pd(N >= 4 ? get(3) / b.get(3) : 0,
+                                   N >= 3 ? get(2) / b.get(2) : 0,
+                                   N >= 2 ? get(1) / b.get(1) : 0,
+                                   N >= 1 ? get(0) / b.get(0) : 0));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmadd_pd(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmsub_pd(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const &b, self_type const &c) const
+  {
+    return self_type(_mm256_fmadd_pd(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const &b, self_type const &c) const
+  {
+    return self_type(_mm256_fmsub_pd(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum(camp::idx_t = 4) const
-      {
-        auto sh1 = _mm256_permute_pd(m_value, 0x5);
-        auto red1 = _mm256_add_pd(m_value, sh1);
-        return red1[0]+red1[2];
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max(camp::idx_t N = 4) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[3]},
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper halves
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[2]},   <-- just v[2]
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper lane
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return RAJA::max<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::min();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_pd(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // permute the first two and last two lanes of the register
-        // A = { v[1], v[0], v[3], v[2] }
-        register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-        // take the minimum value of each lane
-        // B = { min{v[0], v[1]},
-        //       min{v[0], v[1]},
-        //       min{v[2], v[3]},
-        //       min{v[2], v[3]} }
-        register_type b = _mm256_min_pd(m_value, a);
-
-        // now take the minimum of a lower and upper halves
-        return RAJA::min<element_type>(b[0], b[2]);
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[3]},
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper halves
-          return std::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[2]},   <-- just v[2]
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper lane
-          return std::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return std::min<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::max();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_pd(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum(camp::idx_t = 4) const
+  {
+    auto sh1 = _mm256_permute_pd(m_value, 0x5);
+    auto red1 = _mm256_add_pd(m_value, sh1);
+    return red1[0] + red1[2];
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max(camp::idx_t N = 4) const
+  {
+    if (N == 4) {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[0], v[1]},
+      //       max{v[2], v[3]},
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper halves
+      return RAJA::max<element_type>(b[0], b[2]);
+    } else if (N == 3) {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[0], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[0], v[1]},
+      //       max{v[2], v[2]},   <-- just v[2]
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper lane
+      return RAJA::max<element_type>(b[0], b[2]);
+    } else if (N == 2) {
+      return RAJA::max<element_type>(m_value[0], m_value[1]);
+    } else if (N == 1) {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::min();
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_pd(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // permute the first two and last two lanes of the register
+    // A = { v[1], v[0], v[3], v[2] }
+    register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+    // take the minimum value of each lane
+    // B = { min{v[0], v[1]},
+    //       min{v[0], v[1]},
+    //       min{v[2], v[3]},
+    //       min{v[2], v[3]} }
+    register_type b = _mm256_min_pd(m_value, a);
+
+    // now take the minimum of a lower and upper halves
+    return RAJA::min<element_type>(b[0], b[2]);
+  }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return The smallest scalar element in the first N lanes of the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N == 4) {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[0], v[1]},
+      //       min{v[2], v[3]},
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper halves
+      return std::min<element_type>(b[0], b[2]);
+    } else if (N == 3) {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[0], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[0], v[1]},
+      //       min{v[2], v[2]},   <-- just v[2]
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper lane
+      return std::min<element_type>(b[0], b[2]);
+    } else if (N == 2) {
+      return std::min<element_type>(m_value[0], m_value[1]);
+    } else if (N == 1) {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::max();
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_pd(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp
index 4b1e11419d..b1b531d761 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx2_float_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -33,487 +34,504 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<float, avx2_register> :
-    public internal::expt::RegisterBase<Register<float, avx2_register>>
+template <>
+class Register<float, avx2_register>
+    : public internal::expt::RegisterBase<Register<float, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<float, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type = Register<float, avx2_register>;
+  using element_type = float;
+  using register_type = __m256;
+
+  using int_vector_type = Register<int32_t, avx2_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi32(N >= 8 ? -1 : 0,
+                            N >= 7 ? -1 : 0,
+                            N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0,
+                            N >= 4 ? -1 : 0,
+                            N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0,
+                            N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi32(7 * stride,
+                            6 * stride,
+                            5 * stride,
+                            4 * stride,
+                            3 * stride,
+                            2 * stride,
+                            stride,
+                            0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute1(camp::idx_t N) const
+  {
+    // Generate a permutation for first round of min/max routines
+    return _mm256_set_epi32(N >= 7 ? 6 : 0,
+                            N >= 8 ? 7 : 0,
+                            N >= 5 ? 4 : 0,
+                            N >= 6 ? 5 : 0,
+                            N >= 3 ? 2 : 0,
+                            N >= 4 ? 3 : 0,
+                            N >= 1 ? 0 : 0,
+                            N >= 2 ? 1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute2(camp::idx_t N) const
+  {
+    // Generate a permutation for second round of min/max routines
+    return _mm256_set_epi32(N >= 6 ? 5 : 0,
+                            N >= 5 ? 4 : 0,
+                            N >= 8 ? 7 : 0,
+                            N >= 7 ? 6 : 0,
+                            N >= 2 ? 1 : 0,
+                            N >= 1 ? 0 : 0,
+                            N >= 4 ? 3 : 0,
+                            N >= 2 ? 2 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_ps()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_ps(x7, x6, x5, x4, x3, x2, x1, x0))
+  {
+  }
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const &c) : m_value(_mm256_set1_ps(c)) {}
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    m_value = _mm256_loadu_ps(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_ps(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    m_value = _mm256_i32gather_ps(ptr,
+                                  createStridedOffsets(stride),
+                                  sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_mask_i32gather_ps(_mm256_setzero_ps(),
+                                       ptr,
+                                       createStridedOffsets(stride),
+                                       _mm256_castsi256_ps(createMask(N)),
+                                       sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    _mm256_storeu_ps(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_ps(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
+  {
+    m_value = _mm256_set1_ps(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm256_add_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm256_sub_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(_mm256_mul_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    return self_type(_mm256_div_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<float, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<float, avx2_register>;
-      using element_type = float;
-      using register_type = __m256;
-
-      using int_vector_type = Register<int32_t, avx2_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi32(
-            7*stride, 6*stride, 5*stride, 4*stride,
-            3*stride, 2*stride, stride, 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute1(camp::idx_t N) const {
-        // Generate a permutation for first round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 7 ? 6 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 6 ? 5 : 0,
-            N >= 3 ? 2 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 2 ? 1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute2(camp::idx_t N) const {
-        // Generate a permutation for second round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 6 ? 5 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 7 ? 6 : 0,
-            N >= 2 ? 1 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 2 ? 2 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_ps()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_ps(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_ps(c)) {}
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_ps(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_ps(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        m_value = _mm256_i32gather_ps(ptr,
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_mask_i32gather_ps(_mm256_setzero_ps(),
-                                      ptr,
-                                      createStridedOffsets(stride),
-                                      _mm256_castsi256_ps(createMask(N)),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_ps(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_ps(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_ps(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N ) const {
-        // AVX2 does not supply a masked divide
-        return self_type(_mm256_set_ps(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
+    // AVX2 does not supply a masked divide
+    return self_type(_mm256_set_ps(N >= 8 ? get(7) / b.get(7) : 0,
+                                   N >= 7 ? get(6) / b.get(6) : 0,
+                                   N >= 6 ? get(5) / b.get(5) : 0,
+                                   N >= 5 ? get(4) / b.get(4) : 0,
+                                   N >= 4 ? get(3) / b.get(3) : 0,
+                                   N >= 3 ? get(2) / b.get(2) : 0,
+                                   N >= 2 ? get(1) / b.get(1) : 0,
+                                   N >= 1 ? get(0) / b.get(0) : 0));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmadd_ps(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmsub_ps(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const &b, self_type const &c) const
+  {
+    return self_type(_mm256_fmadd_ps(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const &b, self_type const &c) const
+  {
+    return self_type(_mm256_fmsub_ps(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_add_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_add_ps(red1, sh2);
-
-        return red2[0] + red2[4];
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(8));
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        return std::max<element_type>(red2[0], red2[4]);
-
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::min();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return std::max<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        if(N == 3){
-          return std::max<element_type>(red1[0], m_value[2]);
-        }
-        if(N == 4){
-          return std::max<element_type>(red1[0], red1[2]);
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(N));
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        return std::max<element_type>(red2[0], red2[4]);
-
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_ps(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(8));
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        return std::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::max();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return std::min<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        if(N == 3){
-          return std::min<element_type>(red1[0], m_value[2]);
-        }
-        if(N == 4){
-          return std::min<element_type>(red1[0], red1[2]);
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(N));
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        return std::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_ps(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap odd-even pairs and add
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_add_ps(m_value, sh1);
+
+    // swap odd-even quads and add
+    auto sh2 = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_add_ps(red1, sh2);
+
+    return red2[0] + red2[4];
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+
+    // swap odd-even pairs and take the max
+    auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    // swap odd-even quads and take the max
+    auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(8));
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    return std::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements (1 <= N <= 8)
+   * @return The largest scalar among elements 0..N-1 of the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8) {
+      return RAJA::operators::limits<float>::min();
+    }
+    if (N == 1) {
+      return m_value[0];
+    }
+    if (N == 2) {
+      return std::max<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and take the max
+    auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    if (N == 3) {
+      return std::max<element_type>(red1[0], m_value[2]);
+    }
+    if (N == 4) {
+      return std::max<element_type>(red1[0], red1[2]);
+    }
+
+    // swap odd-even quads and take the max
+    auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(N));
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    return std::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_ps(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+
+    // swap odd-even pairs and take the min
+    auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    // swap odd-even quads and take the min
+    auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(8));
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    return std::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements (1 <= N <= 8)
+   * @return The smallest scalar among elements 0..N-1 of the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8) {
+      return RAJA::operators::limits<float>::max();
+    }
+    if (N == 1) {
+      return m_value[0];
+    }
+    if (N == 2) {
+      return std::min<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and take the min
+    auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    if (N == 3) {
+      return std::min<element_type>(red1[0], m_value[2]);
+    }
+    if (N == 4) {
+      return std::min<element_type>(red1[0], red1[2]);
+    }
+
+    // swap odd-even quads and take the min
+    auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(N));
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    return std::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_ps(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp
index ab5948a3f2..e1accb4619 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx2_int32_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -34,535 +35,582 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<int32_t, avx2_register> :
-    public internal::expt::RegisterBase<Register<int32_t, avx2_register>>
+template <>
+class Register<int32_t, avx2_register>
+    : public internal::expt::RegisterBase<Register<int32_t, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int32_t, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type = Register<int32_t, avx2_register>;
+  using element_type = int32_t;
+  using register_type = __m256i;
+
+  using int_vector_type = Register<int32_t, avx2_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi32(N >= 8 ? -1 : 0,
+                            N >= 7 ? -1 : 0,
+                            N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0,
+                            N >= 4 ? -1 : 0,
+                            N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0,
+                            N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi32(7 * stride,
+                            6 * stride,
+                            5 * stride,
+                            4 * stride,
+                            3 * stride,
+                            2 * stride,
+                            stride,
+                            0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute1(camp::idx_t N) const
+  {
+    // Generate a permutation for first round of min/max routines
+    return _mm256_set_epi32(N >= 7 ? 6 : 0,
+                            N >= 8 ? 7 : 0,
+                            N >= 5 ? 4 : 0,
+                            N >= 6 ? 5 : 0,
+                            N >= 3 ? 2 : 0,
+                            N >= 4 ? 3 : 0,
+                            N >= 1 ? 0 : 0,
+                            N >= 2 ? 1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute2(camp::idx_t N) const
+  {
+    // Generate a permutation for second round of min/max routines
+    return _mm256_set_epi32(N >= 6 ? 5 : 0,
+                            N >= 5 ? 4 : 0,
+                            N >= 8 ? 7 : 0,
+                            N >= 7 ? 6 : 0,
+                            N >= 2 ? 1 : 0,
+                            N >= 1 ? 0 : 0,
+                            N >= 4 ? 3 : 0,
+                            N >= 2 ? 2 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0))
+  {
+  }
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const &c) : m_value(_mm256_set1_epi32(c)) {}
+
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    m_value = _mm256_loadu_si256((__m256i const *)ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_epi32(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    m_value = _mm256_i32gather_epi32(ptr,
+                                     createStridedOffsets(stride),
+                                     sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_mask_i32gather_epi32(_mm256_setzero_si256(),
+                                          ptr,
+                                          createStridedOffsets(stride),
+                                          createMask(N),
+                                          sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_epi32(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i) {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i) {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i) {
+      case 0:
+        return _mm256_extract_epi32(m_value, 0);
+      case 1:
+        return _mm256_extract_epi32(m_value, 1);
+      case 2:
+        return _mm256_extract_epi32(m_value, 2);
+      case 3:
+        return _mm256_extract_epi32(m_value, 3);
+      case 4:
+        return _mm256_extract_epi32(m_value, 4);
+      case 5:
+        return _mm256_extract_epi32(m_value, 5);
+      case 6:
+        return _mm256_extract_epi32(m_value, 6);
+      case 7:
+        return _mm256_extract_epi32(m_value, 7);
+    }
+    return 0;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i) {
+      case 0:
+        m_value = _mm256_insert_epi32(m_value, value, 0);
+        break;
+      case 1:
+        m_value = _mm256_insert_epi32(m_value, value, 1);
+        break;
+      case 2:
+        m_value = _mm256_insert_epi32(m_value, value, 2);
+        break;
+      case 3:
+        m_value = _mm256_insert_epi32(m_value, value, 3);
+        break;
+      case 4:
+        m_value = _mm256_insert_epi32(m_value, value, 4);
+        break;
+      case 5:
+        m_value = _mm256_insert_epi32(m_value, value, 5);
+        break;
+      case 6:
+        m_value = _mm256_insert_epi32(m_value, value, 6);
+        break;
+      case 7:
+        m_value = _mm256_insert_epi32(m_value, value, 7);
+        break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
+  {
+    m_value = _mm256_set1_epi32(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm256_add_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm256_sub_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int32_t, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<int32_t, avx2_register>;
-      using element_type = int32_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int32_t, avx2_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi32(
-            7*stride, 6*stride, 5*stride, 4*stride,
-            3*stride, 2*stride, stride, 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute1(camp::idx_t N) const {
-        // Generate a permutation for first round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 7 ? 6 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 6 ? 5 : 0,
-            N >= 3 ? 2 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 2 ? 1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute2(camp::idx_t N) const {
-        // Generate a permutation for second round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 6 ? 5 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 7 ? 6 : 0,
-            N >= 2 ? 1 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 2 ? 2 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_epi32(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi32(c)) {}
-
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256((__m256i const *)ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_epi32(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        m_value = _mm256_i32gather_epi32(ptr,
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_mask_i32gather_epi32(_mm256_setzero_si256(),
-                                      ptr,
-                                      createStridedOffsets(stride),
-                                      createMask(N),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_epi32(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi32(m_value, 0);
-          case 1: return _mm256_extract_epi32(m_value, 1);
-          case 2: return _mm256_extract_epi32(m_value, 2);
-          case 3: return _mm256_extract_epi32(m_value, 3);
-          case 4: return _mm256_extract_epi32(m_value, 4);
-          case 5: return _mm256_extract_epi32(m_value, 5);
-          case 6: return _mm256_extract_epi32(m_value, 6);
-          case 7: return _mm256_extract_epi32(m_value, 7);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi32(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi32(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi32(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi32(m_value, value, 3); break;
-          case 4: m_value = _mm256_insert_epi32(m_value, value, 4); break;
-          case 5: m_value = _mm256_insert_epi32(m_value, value, 5); break;
-          case 6: m_value = _mm256_insert_epi32(m_value, value, 6); break;
-          case 7: m_value = _mm256_insert_epi32(m_value, value, 7); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi32(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-
-        // the AVX2 epi32 multiply only multiplies the even elements
-        // and provides 64-bit results
-        // need to do some repacking to get this to work
-
-        // multiply 0, 2, 4, 6
-        auto prod_even = _mm256_mul_epi32(m_value, b.m_value);
-
-        // Swap 32-bit words
-        auto sh_a = _mm256_castps_si256(
-            _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1));
-
-        auto sh_b = _mm256_castps_si256(
-                    _mm256_permute_ps(_mm256_castsi256_ps(b.m_value), 0xB1));
-
-        // multiply 1, 3, 5, 7
-        auto prod_odd = _mm256_mul_epi32(sh_a, sh_b);
-
-        // Stitch prod_odd and prod_even back together
-        auto sh_odd = _mm256_castps_si256(
-                    _mm256_permute_ps(_mm256_castsi256_ps(prod_odd), 0xB1));
-
-        return self_type(_mm256_blend_epi32(prod_even, sh_odd, 0xAA));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_castps_si256(
-            _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1) );
-        auto red1 = _mm256_add_epi32(m_value, sh1);
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_castps_si256(
-            _mm256_permute_ps(_mm256_castsi256_ps(red1), 0x4E));
-        auto red2 = _mm256_add_epi32(red1, sh2);
-
-        return _mm256_extract_epi32(red2, 0) + _mm256_extract_epi32(red2, 4);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
-        auto red1 = _mm256_max_epi32(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
-        auto red2 = _mm256_max_epi32(red1, sh2);
-
-        return std::max<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::min();
-        }
-        if(N == 1){
-          return get(0);
-        }
-
-        if(N == 2){
-          return std::max<element_type>(get(0), get(1));
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
-        auto red1 = _mm256_max_epi32(m_value, sh1);
-
-        if(N == 3){
-          return std::max<element_type>(_mm256_extract_epi32(red1, 0), get(2));
-        }
-        if(N == 4){
-          return std::max<element_type>(_mm256_extract_epi32(red1, 0), _mm256_extract_epi32(red1, 2));
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
-        auto red2 = _mm256_max_epi32(red1, sh2);
-
-        return std::max<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_epi32(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
-        auto red1 = _mm256_min_epi32(m_value, sh1);
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
-        auto red2 = _mm256_min_epi32(red1, sh2);
-
-        return std::min<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::max();
-        }
-        if(N == 1){
-          return get(0);
-        }
-
-        if(N == 2){
-          return std::min<element_type>(get(0), get(1));
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
-        auto red1 = _mm256_min_epi32(m_value, sh1);
-
-        if(N == 3){
-          return std::min<element_type>(_mm256_extract_epi32(red1, 0), get(2));
-        }
-        if(N == 4){
-          return std::min<element_type>(_mm256_extract_epi32(red1, 0), _mm256_extract_epi32(red1, 2));
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
-        auto red2 = _mm256_min_epi32(red1, sh2);
-
-        return std::min<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_epi32(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+
+    // the AVX2 epi32 multiply only multiplies the even elements
+    // and provides 64-bit results
+    // need to do some repacking to get this to work
+
+    // multiply 0, 2, 4, 6
+    auto prod_even = _mm256_mul_epi32(m_value, b.m_value);
+
+    // Swap 32-bit words
+    auto sh_a = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1));
+
+    auto sh_b = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(b.m_value), 0xB1));
+
+    // multiply 1, 3, 5, 7
+    auto prod_odd = _mm256_mul_epi32(sh_a, sh_b);
+
+    // Stitch prod_odd and prod_even back together
+    auto sh_odd = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(prod_odd), 0xB1));
+
+    return self_type(_mm256_blend_epi32(prod_even, sh_odd, 0xAA));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi32(get(7) / b.get(7),
+                                      get(6) / b.get(6),
+                                      get(5) / b.get(5),
+                                      get(4) / b.get(4),
+                                      get(3) / b.get(3),
+                                      get(2) / b.get(2),
+                                      get(1) / b.get(1),
+                                      get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi32(N >= 8 ? get(7) / b.get(7) : 0,
+                                      N >= 7 ? get(6) / b.get(6) : 0,
+                                      N >= 6 ? get(5) / b.get(5) : 0,
+                                      N >= 5 ? get(4) / b.get(4) : 0,
+                                      N >= 4 ? get(3) / b.get(3) : 0,
+                                      N >= 3 ? get(2) / b.get(2) : 0,
+                                      N >= 2 ? get(1) / b.get(1) : 0,
+                                      N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap odd-even pairs and add
+    auto sh1 = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1));
+    auto red1 = _mm256_add_epi32(m_value, sh1);
+
+
+    // swap odd-even quads and add
+    auto sh2 =
+        _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(red1), 0x4E));
+    auto red2 = _mm256_add_epi32(red1, sh2);
+
+    return _mm256_extract_epi32(red2, 0) + _mm256_extract_epi32(red2, 4);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+
+    // swap odd-even pairs and take the element-wise max
+    auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
+    auto red1 = _mm256_max_epi32(m_value, sh1);
+
+    // swap odd-even quads and take the element-wise max
+    auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
+    auto red2 = _mm256_max_epi32(red1, sh2);
+
+    return std::max<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest of the first N scalar elements in the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8) {
+      return RAJA::operators::limits<int32_t>::min();
+    }
+    if (N == 1) {
+      return get(0);
+    }
+
+    if (N == 2) {
+      return std::max<element_type>(get(0), get(1));
+    }
+
+    // swap odd-even pairs and take the element-wise max
+    auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
+    auto red1 = _mm256_max_epi32(m_value, sh1);
+
+    if (N == 3) {
+      return std::max<element_type>(_mm256_extract_epi32(red1, 0), get(2));
+    }
+    if (N == 4) {
+      return std::max<element_type>(_mm256_extract_epi32(red1, 0),
+                                    _mm256_extract_epi32(red1, 2));
+    }
+
+    // swap odd-even quads and take the element-wise max
+    auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
+    auto red2 = _mm256_max_epi32(red1, sh2);
+
+    return std::max<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_epi32(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+
+    // swap odd-even pairs and take the element-wise min
+    auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
+    auto red1 = _mm256_min_epi32(m_value, sh1);
+
+
+    // swap odd-even quads and take the element-wise min
+    auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
+    auto red2 = _mm256_min_epi32(red1, sh2);
+
+    return std::min<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest of the first N scalar elements in the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8) {
+      return RAJA::operators::limits<int32_t>::max();
+    }
+    if (N == 1) {
+      return get(0);
+    }
+
+    if (N == 2) {
+      return std::min<element_type>(get(0), get(1));
+    }
+
+    // swap odd-even pairs and take the element-wise min
+    auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
+    auto red1 = _mm256_min_epi32(m_value, sh1);
+
+    if (N == 3) {
+      return std::min<element_type>(_mm256_extract_epi32(red1, 0), get(2));
+    }
+    if (N == 4) {
+      return std::min<element_type>(_mm256_extract_epi32(red1, 0),
+                                    _mm256_extract_epi32(red1, 2));
+    }
+
+    // swap odd-even quads and take the element-wise min
+    auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
+    auto red2 = _mm256_min_epi32(red1, sh2);
+
+    return std::min<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_epi32(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp
index 00eea542cd..8ee33d5e84 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx2_int64_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -33,519 +34,540 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int64_t, avx2_register> :
-    public internal::expt::RegisterBase<Register<int64_t, avx2_register>>
+template <>
+class Register<int64_t, avx2_register>
+    : public internal::expt::RegisterBase<Register<int64_t, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int64_t, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type = Register<int64_t, avx2_register>;
+  using element_type = int64_t;
+  using register_type = __m256i;
+
+  using int_vector_type = Register<int64_t, avx2_register>;
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0,
+                             N >= 3 ? -1 : 0,
+                             N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+  /*
+   * Use the packed-double permute function because there isn't one
+   * specifically for int64
+   *
+   * Just adds a bunch of casting, should be same cost
+   */
+  template <int perm>
+  RAJA_INLINE __m256i permute(__m256i x) const
+  {
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(x), perm));
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : m_value(_mm256_set_epi64x(x3, x2, x1, x0))
+  {
+  }
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const &c) : m_value(_mm256_set1_epi64x(c)) {}
+
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    m_value = _mm256_castpd_si256(
+        _mm256_maskload_pd(reinterpret_cast<double const *>(ptr),
+                           createMask(N)));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(int64_t const *ptr, camp::idx_t stride)
+  {
+    m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const *>(ptr),
+                                     createStridedOffsets(stride),
+                                     sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value =
+        _mm256_mask_i64gather_epi64(_mm256_set1_epi64x(0),
+                                    reinterpret_cast<long long const *>(ptr),
+                                    createStridedOffsets(stride),
+                                    createMask(N),
+                                    sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type &gather(element_type const *ptr, int_vector_type offsets)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int64_t, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<int64_t, avx2_register>;
-      using element_type = int64_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int64_t, avx2_register>;
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-      /*
-       * Use the packed-double permute function because there isn't one
-       * specifically for int64
-       *
-       * Just adds a bunch of casting, should be same cost
-       */
-      template<int perm>
-      RAJA_INLINE
-      __m256i permute(__m256i x) const {
-        return _mm256_castpd_si256(
-            _mm256_permute_pd(_mm256_castsi256_pd(x), perm));
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-        m_value(_mm256_set_epi64x(x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi64x(c)) {}
-
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_castpd_si256(
-            _mm256_maskload_pd(reinterpret_cast<double const *>(ptr), createMask(N))
-        );
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(int64_t const *ptr, camp::idx_t stride){
-        m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const *>(ptr),
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_mask_i64gather_epi64(_mm256_set1_epi64x(0),
-                                      reinterpret_cast<long long const *>(ptr),
-                                      createStridedOffsets(stride),
-                                      createMask(N),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const *>(ptr),
-                                      offsets.get_register(),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
+    m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const *>(ptr),
+                                     offsets.get_register(),
+                                     sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type &gather_n(element_type const *ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_mask_i64gather_epi64(_mm256_setzero_si256(),
-                                      reinterpret_cast<long long const *>(ptr),
-                                      offsets.get_register(),
-                                      createMask(N),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr), createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi64(m_value, 0);
-          case 1: return _mm256_extract_epi64(m_value, 1);
-          case 2: return _mm256_extract_epi64(m_value, 2);
-          case 3: return _mm256_extract_epi64(m_value, 3);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi64(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi64(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi64(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi64(m_value, value, 3); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi64x(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        // AVX2 does not supply an int64_t multiply, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)*b.get(3),
-            get(2)*b.get(2),
-            get(1)*b.get(1),
-            get(0)*b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-
-        // swap pairs and add
-        auto sh1 = permute<0x5>(m_value);
-        auto red1 = _mm256_add_epi64(m_value, sh1);
-
-        // add lower and upper
-        return _mm256_extract_epi64(red1, 0) + _mm256_extract_epi64(red1, 2);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red < v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red < v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red < v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::min();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red < v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red < v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red < v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) > a.get(3) ? get(3) : a.get(3),
-              get(2) > a.get(2) ? get(2) : a.get(2),
-              get(1) > a.get(1) ? get(1) : a.get(1),
-              get(0) > a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red > v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red > v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red > v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::max();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red > v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red > v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red > v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) < a.get(3) ? get(3) : a.get(3),
-              get(2) < a.get(2) ? get(2) : a.get(2),
-              get(1) < a.get(1) ? get(1) : a.get(1),
-              get(0) < a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-  };
-
-
-}   // namespace expt
+    m_value =
+        _mm256_mask_i64gather_epi64(_mm256_setzero_si256(),
+                                    reinterpret_cast<long long const *>(ptr),
+                                    offsets.get_register(),
+                                    createMask(N),
+                                    sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Masked store of the register to consecutive memory locations
+   *        Only the lanes enabled by createMask(N) are written.
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_epi64(reinterpret_cast<long long *>(ptr),
+                           createMask(N),
+                           m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *        Element i (0 <= i < 4) is written to ptr[i * stride].
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 4; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *        Element i (0 <= i < N) is written to ptr[i * stride].
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i) {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i, or 0 if i is not in 0..3
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // Dispatch on i so each extract call gets a literal lane index
+    switch (i) {
+      case 0:
+        return _mm256_extract_epi64(m_value, 0);
+      case 1:
+        return _mm256_extract_epi64(m_value, 1);
+      case 2:
+        return _mm256_extract_epi64(m_value, 2);
+      case 3:
+        return _mm256_extract_epi64(m_value, 3);
+    }
+    return 0;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set (i outside 0..3 is a no-op)
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    // Dispatch on i so each insert call gets a literal lane index
+    switch (i) {
+      case 0:
+        m_value = _mm256_insert_epi64(m_value, value, 0);
+        break;
+      case 1:
+        m_value = _mm256_insert_epi64(m_value, value, 1);
+        break;
+      case 2:
+        m_value = _mm256_insert_epi64(m_value, value, 2);
+        break;
+      case 3:
+        m_value = _mm256_insert_epi64(m_value, value, 3);
+        break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
+  {
+    m_value = _mm256_set1_epi64x(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm256_add_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm256_sub_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    // AVX2 does not supply an int64_t multiply, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) * b.get(3),
+                                       get(2) * b.get(2),
+                                       get(1) * b.get(1),
+                                       get(0) * b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) / b.get(3),
+                                       get(2) / b.get(2),
+                                       get(1) / b.get(1),
+                                       get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    // AVX2 has no integer divide: do it per lane, zeroing lanes >= N
+    return self_type(_mm256_set_epi64x(N >= 4 ? get(3) / b.get(3) : 0,
+                                       N >= 3 ? get(2) / b.get(2) : 0,
+                                       N >= 2 ? get(1) / b.get(1) : 0,
+                                       N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+
+    // swap pairs and add
+    auto sh1 = permute<0x5>(m_value);
+    auto red1 = _mm256_add_epi64(m_value, sh1);
+
+    // add lanes 0 and 2 of the pairwise sums
+    return _mm256_extract_epi64(red1, 0) + _mm256_extract_epi64(red1, 2);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // AVX2 does not supply a 64-bit integer max, so compare lane by lane
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red = red < v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red = red < v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red = red < v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return Max of elements [0,N); limits<int64_t>::min() if N not in 1..4
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4) {
+      return RAJA::operators::limits<int64_t>::min();
+    }
+
+    // AVX2 does not supply a 64-bit integer max, so compare lane by lane
+    auto red = get(0);
+
+    if (N > 1) {
+      auto v1 = get(1);
+      red = red < v1 ? v1 : red;
+    }
+    if (N > 2) {
+      auto v2 = get(2);
+      red = red < v2 ? v2 : red;
+    }
+    if (N > 3) {
+      auto v3 = get(3);
+      red = red < v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise largest values of this register and a
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) > a.get(3) ? get(3) : a.get(3),
+                                       get(2) > a.get(2) ? get(2) : a.get(2),
+                                       get(1) > a.get(1) ? get(1) : a.get(1),
+                                       get(0) > a.get(0) ? get(0) : a.get(0)));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // AVX2 does not supply a 64-bit integer min, so compare lane by lane
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red = red > v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red = red > v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red = red > v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return Min of elements [0,N); limits<int64_t>::max() if N not in 1..4
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4) {
+      return RAJA::operators::limits<int64_t>::max();
+    }
+
+    // AVX2 does not supply a 64-bit integer min, so compare lane by lane
+    auto red = get(0);
+
+    if (N > 1) {
+      auto v1 = get(1);
+      red = red > v1 ? v1 : red;
+    }
+    if (N > 2) {
+      auto v2 = get(2);
+      red = red > v2 ? v2 : red;
+    }
+    if (N > 3) {
+      auto v3 = get(3);
+      red = red > v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values of this register and a
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) < a.get(3) ? get(3) : a.get(3),
+                                       get(2) < a.get(2) ? get(2) : a.get(2),
+                                       get(1) < a.get(1) ? get(1) : a.get(1),
+                                       get(0) < a.get(0) ? get(0) : a.get(0)));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/traits.hpp b/include/RAJA/policy/tensor/arch/avx2/traits.hpp
index e95c661335..6b17262e6c 100644
--- a/include/RAJA/policy/tensor/arch/avx2/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/traits.hpp
@@ -21,55 +21,56 @@
 #define RAJA_policy_tensor_arch_avx2_traits_HPP
 
 
-namespace RAJA {
-namespace internal {
-namespace expt {
-
-
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
-
-} // namespace intenral
-} // namespace expt
-} // namespace RAJA
-
-
-#endif // guard
-
-
-
-#endif // __AVX2__
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
+
+
+template <>
+struct RegisterTraits<RAJA::expt::avx2_register, int32_t> {
+  using element_type = int32_t;
+  using register_policy = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type = int32_t;
+};
+
+template <>
+struct RegisterTraits<RAJA::expt::avx2_register, int64_t> {
+  using element_type = int64_t;
+  using register_policy = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type = int64_t;
+};
+
+template <>
+struct RegisterTraits<RAJA::expt::avx2_register, float> {
+  using element_type = float;
+  using register_policy = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type = int32_t;
+};
+
+template <>
+struct RegisterTraits<RAJA::expt::avx2_register, double> {
+  using element_type = double;
+  using register_policy = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type = int64_t;
+};
+
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
+
+
+#endif  // guard
+
+
+#endif  // __AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx512.hpp b/include/RAJA/policy/tensor/arch/avx512.hpp
index 597563da35..d111e278a7 100644
--- a/include/RAJA/policy/tensor/arch/avx512.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512.hpp
@@ -18,11 +18,11 @@
 // Check if the base AVX512 instructions are present
 #ifdef __AVX512F__
 
-#include<RAJA/policy/tensor/arch/avx512/traits.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_int32.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_int64.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_float.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_double.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_double.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_float.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_int32.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_int64.hpp>
+#include <RAJA/policy/tensor/arch/avx512/traits.hpp>
 
 
-#endif // __AVX512F__
+#endif  // __AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp
index a7b7ebaafa..692a9e2390 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx512_double_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -34,360 +35,380 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<double, avx512_register> :
-    public internal::expt::RegisterBase<Register<double, avx512_register>>
+template <>
+class Register<double, avx512_register>
+    : public internal::expt::RegisterBase<Register<double, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<double, avx512_register>>;
+
+
+  using register_policy = avx512_register;
+  using self_type = Register<double, avx512_register>;
+  using element_type = double;
+  using register_type = __m512d;
+
+  using int_vector_type = Register<int64_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask8 createMask(camp::idx_t N) const
+  {
+    // Mask with the low N bits set; N outside 0..8 gives an empty mask
+    switch (N) {
+      case 0:
+        return __mmask8(0x00);
+      case 1:
+        return __mmask8(0x01);
+      case 2:
+        return __mmask8(0x03);
+      case 3:
+        return __mmask8(0x07);
+      case 4:
+        return __mmask8(0x0F);
+      case 5:
+        return __mmask8(0x1F);
+      case 6:
+        return __mmask8(0x3F);
+      case 7:
+        return __mmask8(0x7F);
+      case 8:
+        return __mmask8(0xFF);
+    }
+    return __mmask8(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Offsets i*stride, i=0..7. NOTE(review): mullo_epi64 needs AVX512DQ?
+    auto vstride = _mm512_set1_epi64(stride);
+    auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi64(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_pd()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const &c) : base_type(), m_value(_mm512_set1_pd(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    // AVX512F
+    m_value = _mm512_loadu_pd(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_loadu_pd(_mm512_setzero_pd(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i64gather_pd(createStridedOffsets(stride),
+                                  ptr,
+                                  sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *        Lanes not selected by createMask(N) are taken from a zero source.
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i64gather_pd(_mm512_setzero_pd(),
+                                       createMask(N),
+                                       createStridedOffsets(stride),
+                                       ptr,
+                                       sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    // AVX512F
+    _mm512_storeu_pd(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *        (masked store; lanes are selected by createMask(N)).
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_pd(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *        (scatter using the offsets from createStridedOffsets(stride)).
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i64scatter_pd(ptr,
+                         createStridedOffsets(stride),
+                         m_value,
+                         sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *        (masked scatter; lanes are selected by createMask(N)).
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_i64scatter_pd(ptr,
+                              createMask(N),
+                              createStridedOffsets(stride),
+                              m_value,
+                              sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<double, avx512_register>>;
-
-
-      using register_policy = avx512_register;
-      using self_type = Register<double, avx512_register>;
-      using element_type = double;
-      using register_type = __m512d;
-
-      using int_vector_type = Register<int64_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask8 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0: return __mmask8(0x00);
-					case 1: return __mmask8(0x01);
-					case 2: return __mmask8(0x03);
-					case 3: return __mmask8(0x07);
-					case 4: return __mmask8(0x0F);
-					case 5: return __mmask8(0x1F);
-					case 6: return __mmask8(0x3F);
-					case 7: return __mmask8(0x7F);
-					case 8: return __mmask8(0xFF);
-				}
-				return __mmask8(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi64(stride);
-				auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi64(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_pd()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_pd(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        m_value = _mm512_loadu_pd(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_pd(_mm512_setzero_pd(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i64gather_pd(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i64gather_pd(_mm512_setzero_pd(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        _mm512_storeu_pd(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_pd(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i64scatter_pd(ptr, 
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i64scatter_pd(ptr, 
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_pd(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mul_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm512_div_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        return self_type(_mm512_maskz_div_pd(createMask(N), m_value, b.m_value));
-      }
+    m_value = _mm512_set1_pd(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm512_add_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm512_sub_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(_mm512_mul_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    return self_type(_mm512_div_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    return self_type(_mm512_maskz_div_pd(createMask(N), m_value, b.m_value));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmadd_pd(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmsub_pd(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const &b, self_type const &c) const
+  {
+    return self_type(_mm512_fmadd_pd(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const &b, self_type const &c) const
+  {
+    return self_type(_mm512_fmsub_pd(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_pd(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_pd(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_pd(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_pd(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_pd(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_pd(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_pd(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_pd(m_value); }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_pd(m_value); }
+
+  /*!
+   * @brief Returns the largest element among the lanes enabled by N
+   * @return The largest scalar element under the mask createMask(N)
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_pd(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_pd(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_pd(m_value); }
+
+  /*!
+   * @brief Returns the smallest element among the lanes enabled by N
+   * @return The smallest scalar element under the mask createMask(N)
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_pd(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_pd(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp
index 84cb034a56..45c1ed6651 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx512_float_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -33,367 +34,396 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<float, avx512_register> :
-    public internal::expt::RegisterBase<Register<float, avx512_register>>
+template <>
+class Register<float, avx512_register>
+    : public internal::expt::RegisterBase<Register<float, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<float, avx512_register>>;
+
+  using register_policy = avx512_register;
+  using self_type = Register<float, avx512_register>;
+  using element_type = float;
+  using register_type = __m512;
+
+  using int_vector_type = Register<int32_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask16 createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    switch (N) {
+      case 0:
+        return __mmask16(0x0000);
+      case 1:
+        return __mmask16(0x0001);
+      case 2:
+        return __mmask16(0x0003);
+      case 3:
+        return __mmask16(0x0007);
+      case 4:
+        return __mmask16(0x000F);
+      case 5:
+        return __mmask16(0x001F);
+      case 6:
+        return __mmask16(0x003F);
+      case 7:
+        return __mmask16(0x007F);
+      case 8:
+        return __mmask16(0x00FF);
+      case 9:
+        return __mmask16(0x01FF);
+      case 10:
+        return __mmask16(0x03FF);
+      case 11:
+        return __mmask16(0x07FF);
+      case 12:
+        return __mmask16(0x0FFF);
+      case 13:
+        return __mmask16(0x1FFF);
+      case 14:
+        return __mmask16(0x3FFF);
+      case 15:
+        return __mmask16(0x7FFF);
+      case 16:
+        return __mmask16(0xFFFF);
+    }
+    return __mmask16(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    auto vstride = _mm512_set1_epi32(stride);
+    auto vseq =
+        _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi32(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 16;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_ps()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const &c) : base_type(), m_value(_mm512_set1_ps(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    // AVX512F
+    m_value = _mm512_loadu_ps(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_loadu_ps(_mm512_setzero_ps(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i32gather_ps(createStridedOffsets(stride),
+                                  ptr,
+                                  sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i32gather_ps(_mm512_setzero_ps(),
+                                       createMask(N),
+                                       createStridedOffsets(stride),
+                                       ptr,
+                                       sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    // AVX512F
+    _mm512_storeu_ps(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_ps(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i32scatter_ps(ptr,
+                         createStridedOffsets(stride),
+                         m_value,
+                         sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Scatter partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_i32scatter_ps(ptr,
+                              createMask(N),
+                              createStridedOffsets(stride),
+                              m_value,
+                              sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<float, avx512_register>>;
-
-      using register_policy = avx512_register;
-      using self_type = Register<float, avx512_register>;
-      using element_type = float;
-      using register_type = __m512;
-
-      using int_vector_type = Register<int32_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask16 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0:  return __mmask16(0x0000);
-					case 1:  return __mmask16(0x0001);
-					case 2:  return __mmask16(0x0003);
-					case 3:  return __mmask16(0x0007);
-					case 4:  return __mmask16(0x000F);
-					case 5:  return __mmask16(0x001F);
-					case 6:  return __mmask16(0x003F);
-					case 7:  return __mmask16(0x007F);
-					case 8:  return __mmask16(0x00FF);
-          case 9:  return __mmask16(0x01FF);
-          case 10: return __mmask16(0x03FF);
-          case 11: return __mmask16(0x07FF);
-          case 12: return __mmask16(0x0FFF);
-          case 13: return __mmask16(0x1FFF);
-          case 14: return __mmask16(0x3FFF);
-          case 15: return __mmask16(0x7FFF);
-          case 16: return __mmask16(0xFFFF);
-				}
-				return __mmask16(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi32(stride);
-				auto vseq = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi32(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 16;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_ps()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_ps(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        m_value = _mm512_loadu_ps(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_ps(_mm512_setzero_ps(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i32gather_ps(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i32gather_ps(_mm512_setzero_ps(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        _mm512_storeu_ps(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_ps(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i32scatter_ps(ptr,
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i32scatter_ps(ptr,
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_ps(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mul_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm512_div_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        return self_type(_mm512_maskz_div_ps(createMask(N), m_value, b.m_value));
-      }
+    m_value = _mm512_set1_ps(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm512_add_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm512_sub_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(_mm512_mul_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    return self_type(_mm512_div_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    return self_type(_mm512_maskz_div_ps(createMask(N), m_value, b.m_value));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmadd_ps(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmsub_ps(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const &b, self_type const &c) const
+  {
+    return self_type(_mm512_fmadd_ps(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const &b, self_type const &c) const
+  {
+    return self_type(_mm512_fmsub_ps(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_ps(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_ps(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_ps(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_ps(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_ps(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_ps(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_ps(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_ps(m_value); }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_ps(m_value); }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_ps(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_ps(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_ps(m_value); }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_ps(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_ps(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp
index 021ca90fbe..eca9162fc3 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx512_int32_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -33,419 +34,464 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int32_t, avx512_register> :
-    public internal::expt::RegisterBase<Register<int32_t, avx512_register>>
+template <>
+class Register<int32_t, avx512_register>
+    : public internal::expt::RegisterBase<Register<int32_t, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int32_t, avx512_register>>;
+
+  using register_policy = avx512_register;
+  using self_type = Register<int32_t, avx512_register>;
+  using element_type = int32_t;
+  using register_type = __m512i;
+
+  using int_vector_type = Register<int32_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask16 createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    switch (N) {
+      case 0:
+        return __mmask16(0x0000);
+      case 1:
+        return __mmask16(0x0001);
+      case 2:
+        return __mmask16(0x0003);
+      case 3:
+        return __mmask16(0x0007);
+      case 4:
+        return __mmask16(0x000F);
+      case 5:
+        return __mmask16(0x001F);
+      case 6:
+        return __mmask16(0x003F);
+      case 7:
+        return __mmask16(0x007F);
+      case 8:
+        return __mmask16(0x00FF);
+      case 9:
+        return __mmask16(0x01FF);
+      case 10:
+        return __mmask16(0x03FF);
+      case 11:
+        return __mmask16(0x07FF);
+      case 12:
+        return __mmask16(0x0FFF);
+      case 13:
+        return __mmask16(0x1FFF);
+      case 14:
+        return __mmask16(0x3FFF);
+      case 15:
+        return __mmask16(0x7FFF);
+      case 16:
+        return __mmask16(0xFFFF);
+    }
+    return __mmask16(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    auto vstride = _mm512_set1_epi32(stride);
+    auto vseq =
+        _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi32(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 16;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_epi32()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const &c) : base_type(), m_value(_mm512_set1_epi32(c))
+  {
+  }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    // AVX512F
+#if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
+    m_value = _mm512_loadu_si512(ptr);
+#else
+    m_value = _mm512_loadu_epi32(ptr);  // GNU 7-9 are missing this instruction.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    // AVX512F
+    m_value =
+        _mm512_mask_loadu_epi32(_mm512_setzero_epi32(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i32gather_epi32(createStridedOffsets(stride),
+                                     ptr,
+                                     sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(),
+                                          createMask(N),
+                                          createStridedOffsets(stride),
+                                          ptr,
+                                          sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    // AVX512F
+#if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
+    _mm512_storeu_si512(ptr, m_value);
+#else
+    _mm512_storeu_epi32(ptr, m_value);  // GNU 7-9 are missing this instruction.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_epi32(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i32scatter_epi32(ptr,
+                            createStridedOffsets(stride),
+                            m_value,
+                            sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Scatter partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int32_t, avx512_register>>;
-
-      using register_policy = avx512_register;
-      using self_type = Register<int32_t, avx512_register>;
-      using element_type = int32_t;
-      using register_type = __m512i;
-
-      using int_vector_type = Register<int32_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask16 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0:  return __mmask16(0x0000);
-					case 1:  return __mmask16(0x0001);
-					case 2:  return __mmask16(0x0003);
-					case 3:  return __mmask16(0x0007);
-					case 4:  return __mmask16(0x000F);
-					case 5:  return __mmask16(0x001F);
-					case 6:  return __mmask16(0x003F);
-					case 7:  return __mmask16(0x007F);
-					case 8:  return __mmask16(0x00FF);
-          case 9:  return __mmask16(0x01FF);
-          case 10: return __mmask16(0x03FF);
-          case 11: return __mmask16(0x07FF);
-          case 12: return __mmask16(0x0FFF);
-          case 13: return __mmask16(0x1FFF);
-          case 14: return __mmask16(0x3FFF);
-          case 15: return __mmask16(0x7FFF);
-          case 16: return __mmask16(0xFFFF);
-				}
-				return __mmask16(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi32(stride);
-				auto vseq = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi32(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 16;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_epi32()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_epi32(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        #if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
-        m_value = _mm512_loadu_si512(ptr);
-        #else
-        m_value = _mm512_loadu_epi32(ptr);  // GNU 7-9 are missing this instruction.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_epi32(_mm512_setzero_epi32(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i32gather_epi32(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        #if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
-        _mm512_storeu_si512(ptr, m_value);
-        #else
-        _mm512_storeu_epi32(ptr, m_value);  // GNU 7-9 are missing this instruction.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_epi32(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i32scatter_epi32(ptr,
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i32scatter_epi32(ptr,
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // GNU 7-10 are missing this instruction.
-        #if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))
-        #define _mm512_cvtsi512_si32(x) _mm_cvtsi128_si32(_mm512_castsi512_si128(x))
-        #endif
-
-				switch(i){	
-					case 0: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 0));
-					case 1: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 1));
-					case 2: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 2));
-					case 3: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 3));
-					case 4: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 4));
-					case 5: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 5));
-					case 6: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 6));
-					case 7: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 7));
-					case 8: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 8));
-					case 9: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 9));
-					case 10: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 10));
-					case 11: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 11));
-					case 12: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 12));
-					case 13: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 13));
-					case 14: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 14));
-					case 15: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 15));
-				}
-				return 0;
-			}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-				m_value = _mm512_mask_set1_epi32(m_value, 1 << i, value);
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_epi32(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mullo_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi32(
-            get(15)/b.get(15),
-            get(14)/b.get(14),
-            get(13)/b.get(13),
-            get(12)/b.get(12),
-            get(11)/b.get(11),
-            get(10)/b.get(10),
-            get(9)/b.get(9),
-            get(8)/b.get(8),
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi32(
-            N >= 16 ? get(15)/b.get(15) : 0,
-            N >= 15 ? get(14)/b.get(14) : 0,
-            N >= 14 ? get(13)/b.get(13) : 0,
-            N >= 13 ? get(12)/b.get(12) : 0,
-            N >= 12 ? get(11)/b.get(11) : 0,
-            N >= 11 ? get(10)/b.get(10) : 0,
-            N >= 10 ? get(9)/b.get(9) : 0,
-            N >= 9 ? get(8)/b.get(8) : 0,
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_epi32(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_epi32(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_epi32(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_epi32(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_epi32(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_epi32(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_epi32(m_value, a.m_value));
-      }
-  };
-
-}   // namespace expt
+    // AVX512F
+    _mm512_mask_i32scatter_epi32(ptr,
+                                 createMask(N),
+                                 createStridedOffsets(stride),
+                                 m_value,
+                                 sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i (0 if i is not in the range 0..15)
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+// GNU 7-10 are missing _mm512_cvtsi512_si32; emulate it via the low 128 bits.
+#if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))
+#define _mm512_cvtsi512_si32(x) _mm_cvtsi128_si32(_mm512_castsi512_si128(x))
+#endif
+
+    switch (i) {  // alignr takes an immediate count, hence one case per lane
+      case 0:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 0));
+      case 1:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 1));
+      case 2:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 2));
+      case 3:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 3));
+      case 4:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 4));
+      case 5:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 5));
+      case 6:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 6));
+      case 7:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 7));
+      case 8:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 8));
+      case 9:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 9));
+      case 10:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 10));
+      case 11:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 11));
+      case 12:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 12));
+      case 13:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 13));
+      case 14:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 14));
+      case 15:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 15));
+    }
+    return 0;  // i did not match any lane index
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    m_value = _mm512_mask_set1_epi32(m_value, 1 << i, value);  // mask bit i only
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
+  {
+    m_value = _mm512_set1_epi32(value);  // replicate value into every element
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;  // overwrite this register with src's contents
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm512_add_epi32(m_value, b.m_value));  // element-wise this + b
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm512_sub_epi32(m_value, b.m_value));  // element-wise this - b
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(_mm512_mullo_epi32(m_value, b.m_value));  // low 32 bits of each product
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    // AVX512 does not supply an integer divide, so divide lane by lane via get()
+    return self_type(_mm512_set_epi32(get(15) / b.get(15),  // args run lane 15 -> 0
+                                      get(14) / b.get(14),
+                                      get(13) / b.get(13),
+                                      get(12) / b.get(12),
+                                      get(11) / b.get(11),
+                                      get(10) / b.get(10),
+                                      get(9) / b.get(9),
+                                      get(8) / b.get(8),
+                                      get(7) / b.get(7),
+                                      get(6) / b.get(6),
+                                      get(5) / b.get(5),
+                                      get(4) / b.get(4),
+                                      get(3) / b.get(3),
+                                      get(2) / b.get(2),
+                                      get(1) / b.get(1),
+                                      get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    // No AVX512 integer divide: divide lanes 0..N-1 one by one, zero the rest
+    return self_type(_mm512_set_epi32(N >= 16 ? get(15) / b.get(15) : 0,
+                                      N >= 15 ? get(14) / b.get(14) : 0,
+                                      N >= 14 ? get(13) / b.get(13) : 0,
+                                      N >= 13 ? get(12) / b.get(12) : 0,
+                                      N >= 12 ? get(11) / b.get(11) : 0,
+                                      N >= 11 ? get(10) / b.get(10) : 0,
+                                      N >= 10 ? get(9) / b.get(9) : 0,
+                                      N >= 9 ? get(8) / b.get(8) : 0,
+                                      N >= 8 ? get(7) / b.get(7) : 0,
+                                      N >= 7 ? get(6) / b.get(6) : 0,
+                                      N >= 6 ? get(5) / b.get(5) : 0,
+                                      N >= 5 ? get(4) / b.get(4) : 0,
+                                      N >= 4 ? get(3) / b.get(3) : 0,
+                                      N >= 3 ? get(2) / b.get(2) : 0,
+                                      N >= 2 ? get(1) / b.get(1) : 0,
+                                      N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_epi32(m_value); }
+
+  /*!
+   * @brief Returns the largest element (reduction over the whole register)
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_epi32(m_value); }
+
+  /*!
+   * @brief Returns the largest of the elements selected by createMask(N)
+   * @return The largest scalar element among the masked lanes
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_epi32(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector holding, per element, the larger of this register and a
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_epi32(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_epi32(m_value); }
+
+  /*!
+   * @brief Returns the smallest of the elements selected by createMask(N)
+   * @return The smallest scalar element among the masked lanes
+   */
+  RAJA_INLINE
+  element_type min(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_epi32(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_epi32(m_value, a.m_value));
+  }
+};
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
index 17f929c607..7d72772a22 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
@@ -21,11 +21,12 @@
 #define RAJA_policy_vector_register_avx512_long_HPP
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
+#include "RAJA/util/macros.hpp"
 
 // Include SIMD intrinsics header file
 #include <immintrin.h>
+
 #include <cmath>
 
 
@@ -33,373 +34,403 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int64_t, avx512_register> :
-    public internal::expt::RegisterBase<Register<int64_t, avx512_register>>
+template <>
+class Register<int64_t, avx512_register>
+    : public internal::expt::RegisterBase<Register<int64_t, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int64_t, avx512_register>>;
+
+  using register_policy = avx512_register;
+  using self_type = Register<int64_t, avx512_register>;
+  using element_type = int64_t;
+  using register_type = __m512i;
+
+  using int_vector_type = Register<int64_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask8 createMask(camp::idx_t N) const
+  {
+    // Generate a mask with the low N bits set (N in 0..8); any other N gives 0
+    switch (N) {
+      case 0:
+        return __mmask8(0x00);
+      case 1:
+        return __mmask8(0x01);
+      case 2:
+        return __mmask8(0x03);
+      case 3:
+        return __mmask8(0x07);
+      case 4:
+        return __mmask8(0x0F);
+      case 5:
+        return __mmask8(0x1F);
+      case 6:
+        return __mmask8(0x3F);
+      case 7:
+        return __mmask8(0x7F);
+      case 8:
+        return __mmask8(0xFF);
+    }
+    return __mmask8(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list: lane i holds i * stride (i = 0..7)
+    auto vstride = _mm512_set1_epi64(stride);
+    auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);  // args run lane 7 -> 0
+    return _mm512_mullo_epi64(vstride, vseq);  // NOTE(review): mullo_epi64 looks like AVX512DQ, not AVX512F - confirm
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F (setzero_epi32 clears the full register, so 64-bit lanes are zero too)
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_epi32()) {}
+
+  /*!
+   * @brief Construct from an underlying simd register value (register_type)
+   */
+  RAJA_INLINE
+  explicit Register(register_type const &c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor, duplicates the register held by c
+   */
+  RAJA_INLINE
+  Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all 64-bit elements to the same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const &c) : base_type(), m_value(_mm512_set1_epi64(c))
+  {
+  }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    // AVX512F; compilers lacking _mm512_loadu_epi64 use an all-ones masked load
+#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
+    (!defined(SYCL_LANGUAGE_VERSION) &&                             \
+     defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
+    m_value = _mm512_maskz_loadu_epi64(
+        ~0,
+        ptr);  // May cause slowdown due to looping over 8 bytes, one at a time.
+#else
+    m_value =
+        _mm512_loadu_epi64(ptr);  // GNU 7-10 are missing this instruction, as
+                                  // is icpx as of version 2022.2.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *        Lanes not selected by createMask(N) are set to zero.
+   */
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    // AVX512F: masked-off lanes take their value from the zeroed source operand
+    m_value =
+        _mm512_mask_loadu_epi64(_mm512_setzero_epi32(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *        (lane i reads ptr[i * stride])
+   */
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t stride)
+  {
+    // AVX512F: offsets are in elements, scaled to bytes by sizeof(element_type)
+    m_value = _mm512_i64gather_epi64(createStridedOffsets(stride),
+                                     ptr,
+                                     sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *        Lanes not selected by createMask(N) are set to zero.
+   */
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    // AVX512F: masked-off lanes take their value from the zeroed source operand
+    m_value = _mm512_mask_i64gather_epi64(_mm512_setzero_epi32(),
+                                          createMask(N),
+                                          createStridedOffsets(stride),
+                                          ptr,
+                                          sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    // AVX512F; compilers lacking _mm512_storeu_epi64 use an all-ones masked store
+#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
+    (!defined(SYCL_LANGUAGE_VERSION) &&                             \
+     defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
+    _mm512_mask_storeu_epi64(ptr,
+                             ~0,
+                             m_value);  // May cause slowdown due to looping
+                                        // over 8 bytes, one at a time.
+#else
+    _mm512_storeu_epi64(ptr,
+                        m_value);  // GNU 7-10 are missing this instruction, as
+                                   // is icpx as of version 2022.2.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *        (masked store: only lanes selected by createMask(N) are written)
+   */
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_epi64(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t stride) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int64_t, avx512_register>>;
-
-      using register_policy = avx512_register;
-      using self_type = Register<int64_t, avx512_register>;
-      using element_type = int64_t;
-      using register_type = __m512i;
-
-      using int_vector_type = Register<int64_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask8 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0: return __mmask8(0x00);
-					case 1: return __mmask8(0x01);
-					case 2: return __mmask8(0x03);
-					case 3: return __mmask8(0x07);
-					case 4: return __mmask8(0x0F);
-					case 5: return __mmask8(0x1F);
-					case 6: return __mmask8(0x3F);
-					case 7: return __mmask8(0x7F);
-					case 8: return __mmask8(0xFF);
-				}
-				return __mmask8(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi64(stride);
-				auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi64(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_epi32()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_epi64(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        #if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
-            (!defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
-        m_value = _mm512_maskz_loadu_epi64(~0, ptr);  // May cause slowdown due to looping over 8 bytes, one at a time.
-        #else
-        m_value = _mm512_loadu_epi64(ptr);  // GNU 7-10 are missing this instruction, as is icpx as of version 2022.2.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_epi64(_mm512_setzero_epi32(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i64gather_epi64(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i64gather_epi64(_mm512_setzero_epi32(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        #if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
-            (!defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
-        _mm512_mask_storeu_epi64(ptr, ~0, m_value);  // May cause slowdown due to looping over 8 bytes, one at a time.
-        #else
-        _mm512_storeu_epi64(ptr, m_value);  // GNU 7-10 are missing this instruction, as is icpx as of version 2022.2.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_epi64(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i64scatter_epi64(ptr,
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i64scatter_epi64(ptr,
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_epi64(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mullo_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi64(
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N ) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi64(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_epi64(m_value);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_epi64(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_epi64(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_epi64(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_epi64(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_epi64(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_epi64(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+    // AVX512F
+    _mm512_i64scatter_epi64(ptr,
+                            createStridedOffsets(stride),
+                            m_value,
+                            sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_i64scatter_epi64(ptr,
+                                 createMask(N),
+                                 createStridedOffsets(stride),
+                                 m_value,
+                                 sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set
+   */
+  RAJA_INLINE
+  self_type &set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &value)
+  {
+    m_value = _mm512_set1_epi64(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(_mm512_add_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(_mm512_sub_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(_mm512_mullo_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi64(get(7) / b.get(7),
+                                      get(6) / b.get(6),
+                                      get(5) / b.get(5),
+                                      get(4) / b.get(4),
+                                      get(3) / b.get(3),
+                                      get(2) / b.get(2),
+                                      get(1) / b.get(1),
+                                      get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, camp::idx_t N) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi64(N >= 8 ? get(7) / b.get(7) : 0,
+                                      N >= 7 ? get(6) / b.get(6) : 0,
+                                      N >= 6 ? get(5) / b.get(5) : 0,
+                                      N >= 5 ? get(4) / b.get(4) : 0,
+                                      N >= 4 ? get(3) / b.get(3) : 0,
+                                      N >= 3 ? get(2) / b.get(2) : 0,
+                                      N >= 2 ? get(1) / b.get(1) : 0,
+                                      N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_epi64(m_value); }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_epi64(m_value); }
+
+  /*!
+   * @brief Returns the largest element under the mask createMask(N)
+   * @return The largest scalar among the masked elements of the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_epi64(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_epi64(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_epi64(m_value); }
+
+  /*!
+   * @brief Returns the smallest element under the mask createMask(N)
+   * @return The smallest scalar among the masked elements of the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_epi64(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_epi64(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/traits.hpp b/include/RAJA/policy/tensor/arch/avx512/traits.hpp
index b2b5cf6731..824c32e74a 100644
--- a/include/RAJA/policy/tensor/arch/avx512/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/traits.hpp
@@ -21,53 +21,55 @@
 #ifndef RAJA_policy_tensor_arch_avx512_traits_HPP
 #define RAJA_policy_tensor_arch_avx512_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 16;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx512_register, int32_t> {
+  using element_type = int32_t;
+  using register_policy = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 16;
+  using int_element_type = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx512_register, int64_t> {
+  using element_type = int64_t;
+  using register_policy = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type = int64_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 16;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx512_register, float> {
+  using element_type = float;
+  using register_policy = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 16;
+  using int_element_type = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx512_register, double> {
+  using element_type = double;
+  using register_policy = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type = int64_t;
+};
 
-} // namespace internal
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
-#endif // guard
+#endif  // guard
 
 
-
-#endif // __AVX512F__
+#endif  // __AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/cuda.hpp b/include/RAJA/policy/tensor/arch/cuda.hpp
index a840c63d85..53b48ed64b 100644
--- a/include/RAJA/policy/tensor/arch/cuda.hpp
+++ b/include/RAJA/policy/tensor/arch/cuda.hpp
@@ -21,11 +21,11 @@
 #ifndef RAJA_policy_tensor_arch_cuda_HPP
 #define RAJA_policy_tensor_arch_cuda_HPP
 
-#include<RAJA/policy/tensor/arch/cuda/traits.hpp>
-#include<RAJA/policy/tensor/arch/cuda/cuda_warp.hpp>
+#include <RAJA/policy/tensor/arch/cuda/cuda_warp.hpp>
+#include <RAJA/policy/tensor/arch/cuda/traits.hpp>
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_CUDA
+#endif  // RAJA_ENABLE_CUDA
diff --git a/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp b/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp
index e23eb92bed..4a9e7e07e6 100644
--- a/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp
+++ b/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp
@@ -17,10 +17,9 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/util/Operators.hpp"
+#include "RAJA/util/macros.hpp"
 
 #ifdef RAJA_ENABLE_CUDA
 
@@ -30,981 +29,995 @@
 #define RAJA_policy_tensor_arch_cuda_cuda_warp_register_HPP
 
 
-
 namespace RAJA
 {
 namespace expt
 {
 
-  template<typename ELEMENT_TYPE>
-  class Register<ELEMENT_TYPE, cuda_warp_register> :
-    public internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>
-  {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>;
-
-      using register_policy = cuda_warp_register;
-      using self_type = Register<ELEMENT_TYPE, cuda_warp_register>;
-      using element_type = ELEMENT_TYPE;
-      using register_type = ELEMENT_TYPE;
-
-      using int_vector_type = Register<int64_t, cuda_warp_register>;
-
-
-		private:
-      element_type m_value;
+template <typename ELEMENT_TYPE>
+class Register<ELEMENT_TYPE, cuda_warp_register>
+    : public internal::expt::RegisterBase<
+          Register<ELEMENT_TYPE, cuda_warp_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>;
+
+  using register_policy = cuda_warp_register;
+  using self_type = Register<ELEMENT_TYPE, cuda_warp_register>;
+  using element_type = ELEMENT_TYPE;
+  using register_type = ELEMENT_TYPE;
+
+  using int_vector_type = Register<int64_t, cuda_warp_register>;
+
+
+private:
+  element_type m_value;
+
+public:
+  static constexpr int s_num_elem = 32;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register() : base_type(), m_value(0) {}
+
+
+  /*!
+   * @brief Copy constructor from raw value
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register(element_type c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &operator=(element_type c)
+  {
+    m_value = c;
+    return *this;
+  }
+
+  /*!
+   * @brief Gets our warp lane
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr static int get_lane() { return threadIdx.x; }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  constexpr element_type const &get_raw_value() const { return m_value; }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  element_type &get_raw_value() { return m_value; }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  static constexpr bool is_root() { return get_lane() == 0; }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &load_packed(element_type const *ptr)
+  {
+
+    auto lane = get_lane();
+
+    m_value = ptr[lane];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &load_packed_n(element_type const *ptr, int N)
+  {
+    auto lane = get_lane();
+    if (lane < N) {
+      m_value = ptr[lane];
+    } else {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &load_strided(element_type const *ptr, int stride)
+  {
+
+    auto lane = get_lane();
+
+    m_value = ptr[stride * lane];
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &load_strided_n(element_type const *ptr, int stride, int N)
+  {
+    auto lane = get_lane();
+
+    if (lane < N) {
+      m_value = ptr[stride * lane];
+    } else {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &gather(element_type const *ptr, int_vector_type offsets)
+  {
+
+    m_value = ptr[offsets.get_raw_value()];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &gather_n(element_type const *ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
+    if (get_lane() < N) {
+      m_value = ptr[offsets.get_raw_value()];
+    } else {
+      m_value = element_type(0);
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and gather
+   * operations.
+   *
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type &segmented_load(element_type const *ptr,
+                            camp::idx_t segbits,
+                            camp::idx_t stride_inner,
+                            camp::idx_t stride_outer)
+  {
+    auto lane = get_lane();
+
+    // compute segment index and offset within the segment
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    m_value = ptr[seg * stride_outer + i * stride_inner];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays where we load partial segments.
+   *
+   *
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type &segmented_load_nm(element_type const *ptr,
+                               camp::idx_t segbits,
+                               camp::idx_t stride_inner,
+                               camp::idx_t stride_outer,
+                               camp::idx_t num_inner,
+                               camp::idx_t num_outer)
+  {
+    auto lane = get_lane();
+
+    // compute segment index and offset within the segment
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner) {
+      m_value = element_type(0);
+    } else {
+      m_value = ptr[seg * stride_outer + i * stride_inner];
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const &store_packed(element_type *ptr) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const &store_packed_n(element_type *ptr, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N) {
+      ptr[lane] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const &store_strided(element_type *ptr, int stride) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane * stride] = m_value;
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const &store_strided_n(element_type *ptr, int stride, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N) {
+      ptr[lane * stride] = m_value;
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const &scatter(element_type *ptr,
+                                                   T2 const &offsets) const
+  {
+
+    ptr[offsets.get_raw_value()] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const &scatter_n(element_type *ptr,
+                                                     T2 const &offsets,
+                                                     camp::idx_t N) const
+  {
+    if (get_lane() < N) {
+      ptr[offsets.get_raw_value()] = m_value;
+    }
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays.
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type const &segmented_store(element_type *ptr,
+                                   camp::idx_t segbits,
+                                   camp::idx_t stride_inner,
+                                   camp::idx_t stride_outer) const
+  {
+    auto lane = get_lane();
+
+    // compute segment index and offset within the segment
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    ptr[seg * stride_outer + i * stride_inner] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays where we store partial segments.
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type const &segmented_store_nm(element_type *ptr,
+                                      camp::idx_t segbits,
+                                      camp::idx_t stride_inner,
+                                      camp::idx_t stride_outer,
+                                      camp::idx_t num_inner,
+                                      camp::idx_t num_outer) const
+  {
+    auto lane = get_lane();
+
+    // compute segment index and offset within the segment
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
 
-		public:
-
-      static constexpr int s_num_elem = 32;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register() : base_type(), m_value(0) {
-
-      }
-
-
-      /*!
-       * @brief Copy constructor from raw value
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(element_type c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-
-      /*!
-       * @brief Copy assignment operator
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(element_type c){
-        m_value = c;
-        return *this;
-      }
-
-      /*!
-       * @brief Gets our warp lane
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      static
-      int get_lane() {
-        return threadIdx.x;
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type const &get_raw_value() const {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      element_type &get_raw_value() {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return get_lane() == 0;
-      }
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed(element_type const *ptr){
-
-        auto lane = get_lane();
-
-        m_value = ptr[lane];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed_n(element_type const *ptr, int N){
-        auto lane = get_lane();
-        if(lane < N){
-          m_value = ptr[lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided(element_type const *ptr, int stride){
-
-        auto lane = get_lane();
-
-        m_value = ptr[stride*lane];
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided_n(element_type const *ptr, int stride, int N){
-        auto lane = get_lane();
-
-        if(lane < N){
-          m_value = ptr[stride*lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-
-        m_value = ptr[offsets.get_raw_value()];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        if(get_lane() < N){
-          m_value = ptr[offsets.get_raw_value()];
-        }
-        else{
-          m_value = element_type(0);
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load(element_type const *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer){
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        m_value = ptr[seg*stride_outer + i*stride_inner];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load_nm(element_type const *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer)
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          m_value = element_type(0);
-        }
-        else{
-          m_value = ptr[seg*stride_outer + i*stride_inner];
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed(element_type *ptr) const{
-
-        auto lane = get_lane();
-
-        ptr[lane] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed_n(element_type *ptr, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane] = m_value;
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided(element_type *ptr, int stride) const{
-
-        auto lane = get_lane();
-
-        ptr[lane*stride] = m_value;
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided_n(element_type *ptr, int stride, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane*stride] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, T2 const &offsets) const {
-
-        ptr[offsets.get_raw_value()] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, T2 const &offsets, camp::idx_t N) const {
-        if(get_lane() < N){
-          ptr[offsets.get_raw_value()] = m_value;
-        }
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store(element_type *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer) const {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        ptr[seg*stride_outer + i*stride_inner] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays where we store partial segments.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store_nm(element_type *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          ptr[seg*stride_outer + i*stride_inner] = m_value;
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      constexpr
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type get(int i) const
-			{
-        return  __shfl_sync(0xffffffff, m_value, i, 32);
-			}
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &set(element_type value, int i)
-			{
-				auto lane = get_lane();
-      	if(lane == i){
-					m_value = value;
-				}
-        return *this;
-			}
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &a){
-        m_value = a;
-        return *this;
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.m_value = __shfl_sync(0xffffffff, m_value, i, 32);
-        return x;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(m_value + b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(m_value - b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(m_value * b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(m_value / b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, int N) const {
-        return get_lane() < N ? self_type(m_value / b.m_value) : self_type(element_type(0));
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMA
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+    if (seg >= num_outer || i >= num_inner) {
+      // nop
+    } else {
+      ptr[seg * stride_outer + i * stride_inner] = m_value;
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  constexpr RAJA_INLINE RAJA_DEVICE element_type get(int i) const
+  {
+    return __shfl_sync(0xffffffff, m_value, i, 32);
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &set(element_type value, int i)
+  {
+    auto lane = get_lane();
+    if (lane == i) {
+      m_value = value;
+    }
+    return *this;
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &a)
+  {
+    m_value = a;
+    return *this;
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.m_value = __shfl_sync(0xffffffff, m_value, i, 32);
+    return x;
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(m_value + b.m_value);
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(m_value - b.m_value);
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(m_value * b.m_value);
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    return self_type(m_value / b.m_value);
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, int N) const
+  {
+    return get_lane() < N ? self_type(m_value / b.m_value)
+                          : self_type(element_type(0));
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMA
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
+                              RETURN_TYPE>::type
       multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+  {
+    return self_type(fma(m_value, b.m_value, c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
+                              RETURN_TYPE>::type
       multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value + c.m_value);
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMS
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+  {
+    return self_type(m_value * b.m_value + c.m_value);
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMA with a negated addend
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
+                              RETURN_TYPE>::type
       multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, -c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+  {
+    return self_type(fma(m_value, b.m_value, -c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
+                              RETURN_TYPE>::type
       multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value - c.m_value);
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type sum() const
-      {
-				// Allreduce sum
-				using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
-
-				return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max() const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max_n(int N) const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        auto ident = RAJA::operators::limits<element_type>::min();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmax(self_type a) const
-      {
-        return self_type{RAJA::max<element_type>(m_value, a.m_value)};
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min() const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
-
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min_n(int N) const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        auto ident = RAJA::operators::limits<element_type>::max();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmin(self_type a) const
-      {
-        return self_type{RAJA::min<element_type>(m_value, a.m_value)};
-      }
-
-
-
-
-      /*!
-       * Provides gather/scatter indices for segmented loads and stores
-       *
-       * THe number of segment bits (segbits) is specified, as well as the
-       * stride between elements in a segment (stride_inner),
-       * and the stride between segments (stride_outer)
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      static
-      int_vector_type s_segmented_offsets(camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer)
-      {
-        int_vector_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        result.get_raw_value() = seg*stride_outer + i*stride_inner;
-
-        return result;
-      }
-
-
-      /*!
-       * Sum elements within each segment, with segment size defined by segbits.
-       * Stores each segments sum consecutively, but shifed to the
-       * corresponding output_segment slot.
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums neighboring pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_inner(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int delta = 1;delta < 1<<segbits;delta = delta<<1){
-
-          // tree shuffle
-          element_type y = __shfl_sync(0xffffffff, x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        result.get_raw_value() = __shfl_sync(0xffffffff, x, get_lane()<<segbits);
-
-        // Third: mask off everything but output_segment
-        //        this is because all output segments are valid at this point
-        // (5-segbits), the 5 is since the warp-width is 32 == 1<<5
-        int our_output_segment = get_lane()>>(5-segbits);
-        bool in_output_segment = our_output_segment == output_segment;
-        if(!in_output_segment){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      /*!
-       * Sum across segments, with segment size defined by segbits
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums strided pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_outer(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int i = 0;i < 5-segbits; ++ i){
-
-          // tree shuffle
-          int delta = s_num_elem >> (i+1);
-          element_type y = __shfl_sync(0xffffffff, x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        int get_from = get_lane()&( (1<<segbits)-1);
-        result.get_raw_value() = __shfl_sync(0xffffffff, x, get_from);
-
-        int mask = (get_lane()>>segbits) == output_segment;
-
-
-        // Third: mask off everything but output_segment
-        if(!mask){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_divide_nm(self_type den, camp::idx_t segbits, camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        self_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          result.get_raw_value() = m_value / den.get_raw_value();
-        }
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast copies a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x0, x1, x0, x1, x0, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x3, x2, x3, x2, x3, x2, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x7, x6, x7, x6, x7, x6, x7
-       *
-       *  and so on up to segbits=2, the input segments are 4 wide:
-       *
-       *      input segments allowed are from 0 or 1
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x2, x3, x0, x1, x2, x3
-       *
-       *      output_segment=1:
-       *      Result= x4, x5, x6, x7, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_inner(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t mask = (1<<segbits)-1;
-        camp::idx_t offset = input_segment << segbits;
-
-
-        camp::idx_t i = (get_lane()&mask) + offset;
-
-        result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
-
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast spreads a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x0, x0, x0, x1, x1, x1, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x2, x2, x2, x3, x3, x3, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x6, x6, x6, x7, x7, x7, x7
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_outer(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
-
-        camp::idx_t i = (get_lane() >> segbits) + offset;
-
-        result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
-
-        return result;
-      }
-
-
-
-
-  };
-
-
-
-}   // namespace expt
-
-} // namespace RAJA
-
-
-#endif // Guard
-
-#endif // CUDA
+  {
+    return self_type(m_value * b.m_value - c.m_value);
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type sum() const
+  {
+    // Allreduce sum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
+
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type max() const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the largest element from first N lanes
+   * @return The largest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type max_n(int N) const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    auto ident = RAJA::operators::limits<element_type>::min();
+    auto lane = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type vmax(self_type a) const
+  {
+    return self_type{RAJA::max<element_type>(m_value, a.m_value)};
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type min() const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return The smallest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type min_n(int N) const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    auto ident = RAJA::operators::limits<element_type>::max();
+    auto lane = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type vmin(self_type a) const
+  {
+    return self_type{RAJA::min<element_type>(m_value, a.m_value)};
+  }
+
+
+  /*!
+   * Provides gather/scatter indices for segmented loads and stores
+   *
+   * The number of segment bits (segbits) is specified, as well as the
+   * stride between elements in a segment (stride_inner),
+   * and the stride between segments (stride_outer)
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  static int_vector_type s_segmented_offsets(camp::idx_t segbits,
+                                             camp::idx_t stride_inner,
+                                             camp::idx_t stride_outer)
+  {
+    int_vector_type result;
+
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    result.get_raw_value() = seg * stride_outer + i * stride_inner;
+
+    return result;
+  }
+
+
+  /*!
+   * Sum elements within each segment, with segment size defined by segbits.
+   * Stores each segment's sum consecutively, but shifted to the
+   * corresponding output_segment slot.
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums neighboring pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_sum_inner(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values within each segment
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int delta = 1; delta < 1 << segbits; delta = delta << 1) {
+
+      // tree shuffle
+      element_type y = __shfl_sync(0xffffffff, x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    result.get_raw_value() = __shfl_sync(0xffffffff, x, get_lane() << segbits);
+
+    // Third: mask off everything but output_segment
+    //        this is because all output segments are valid at this point
+    // (5-segbits), the 5 is since the warp-width is 32 == 1<<5
+    int our_output_segment = get_lane() >> (5 - segbits);
+    bool in_output_segment = our_output_segment == output_segment;
+    if (!in_output_segment) {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  /*!
+   * Sum across segments, with segment size defined by segbits
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums strided pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_sum_outer(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values within each segment
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int i = 0; i < 5 - segbits; ++i) {
+
+      // tree shuffle
+      int delta = s_num_elem >> (i + 1);
+      element_type y = __shfl_sync(0xffffffff, x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    int get_from = get_lane() & ((1 << segbits) - 1);
+    result.get_raw_value() = __shfl_sync(0xffffffff, x, get_from);
+
+    int mask = (get_lane() >> segbits) == output_segment;
+
+
+    // Third: mask off everything but output_segment
+    if (!mask) {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_divide_nm(self_type den,
+                                camp::idx_t segbits,
+                                camp::idx_t num_inner,
+                                camp::idx_t num_outer) const
+  {
+    self_type result;
+
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner) {
+      // nop
+    } else {
+      result.get_raw_value() = m_value / den.get_raw_value();
+    }
+
+    return result;
+  }
+
+
+  /*!
+   * Segmented broadcast copies a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x0, x1, x0, x1, x0, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x3, x2, x3, x2, x3, x2, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x7, x6, x7, x6, x7, x6, x7
+   *
+   *  and so on up to segbits=2, the input segments are 4 wide:
+   *
+   *      input segments allowed are from 0 or 1
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x2, x3, x0, x1, x2, x3
+   *
+   *      input_segment=1:
+   *      Result= x4, x5, x6, x7, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_broadcast_inner(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t mask = (1 << segbits) - 1;
+    camp::idx_t offset = input_segment << segbits;
+
+
+    camp::idx_t i = (get_lane() & mask) + offset;
+
+    result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
+
+
+    return result;
+  }
+
+
+  /*!
+   * Segmented broadcast spreads a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x0, x0, x0, x1, x1, x1, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x2, x2, x2, x3, x3, x3, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x6, x6, x6, x7, x7, x7, x7
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_broadcast_outer(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
+
+    camp::idx_t i = (get_lane() >> segbits) + offset;
+
+    result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
+
+    return result;
+  }
+};
+
+
+}  // namespace expt
+
+}  // namespace RAJA
+
+
+#endif  // Guard
+
+#endif  // CUDA
diff --git a/include/RAJA/policy/tensor/arch/cuda/traits.hpp b/include/RAJA/policy/tensor/arch/cuda/traits.hpp
index 032517677c..1bc389137e 100644
--- a/include/RAJA/policy/tensor/arch/cuda/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/cuda/traits.hpp
@@ -21,26 +21,28 @@
 #ifndef RAJA_policy_tensor_arch_cuda_traits_HPP
 #define RAJA_policy_tensor_arch_cuda_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
-
-  template<typename T>
-  struct RegisterTraits<RAJA::expt::cuda_warp_register, T>{
-      using element_type = T;
-      using register_policy = RAJA::expt::cuda_warp_register;
-      static constexpr camp::idx_t s_num_elem = 32;
-      static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
-      using int_element_type = int32_t;
-  };
-
-} // namespace internal
-} // namespace expt
-} // namespace RAJA
-
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
+
+template <typename T>
+struct RegisterTraits<RAJA::expt::cuda_warp_register, T> {
+  using element_type = T;
+  using register_policy = RAJA::expt::cuda_warp_register;
+  static constexpr camp::idx_t s_num_elem = 32;
+  static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
+  using int_element_type = int32_t;
+};
+
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_CUDA
+#endif  // RAJA_ENABLE_CUDA
diff --git a/include/RAJA/policy/tensor/arch/hip.hpp b/include/RAJA/policy/tensor/arch/hip.hpp
index 6e76772a29..1ad8e143b3 100644
--- a/include/RAJA/policy/tensor/arch/hip.hpp
+++ b/include/RAJA/policy/tensor/arch/hip.hpp
@@ -21,11 +21,11 @@
 #ifndef RAJA_policy_tensor_arch_hip_HPP
 #define RAJA_policy_tensor_arch_hip_HPP
 
-#include<RAJA/policy/tensor/arch/hip/traits.hpp>
-#include<RAJA/policy/tensor/arch/hip/hip_wave.hpp>
+#include <RAJA/policy/tensor/arch/hip/hip_wave.hpp>
+#include <RAJA/policy/tensor/arch/hip/traits.hpp>
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_CUDA
+#endif  // RAJA_ENABLE_HIP
diff --git a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp
index f1810807f9..9191c720cb 100644
--- a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp
+++ b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp
@@ -17,10 +17,9 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 #include "RAJA/config.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/pattern/tensor/internal/RegisterBase.hpp"
-#include "RAJA/util/macros.hpp"
 #include "RAJA/util/Operators.hpp"
+#include "RAJA/util/macros.hpp"
 
 #ifdef RAJA_ENABLE_HIP
 
@@ -30,984 +29,1000 @@
 #define RAJA_policy_tensor_arch_hip_hip_wave_register_HPP
 
 
-
 namespace RAJA
 {
 namespace expt
 {
 
 
-  template<typename ELEMENT_TYPE>
-  class Register<ELEMENT_TYPE, hip_wave_register> :
-    public internal::expt::RegisterBase<Register<ELEMENT_TYPE, hip_wave_register>>
+template <typename ELEMENT_TYPE>
+class Register<ELEMENT_TYPE, hip_wave_register>
+    : public internal::expt::RegisterBase<
+          Register<ELEMENT_TYPE, hip_wave_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<ELEMENT_TYPE, hip_wave_register>>;
+
+  using register_policy = hip_wave_register;
+  using self_type = Register<ELEMENT_TYPE, hip_wave_register>;
+  using element_type = ELEMENT_TYPE;
+  using register_type = ELEMENT_TYPE;
+
+  using int_vector_type = Register<int64_t, hip_wave_register>;
+
+
+private:
+  element_type m_value;
+
+public:
+  static constexpr int s_num_elem = policy::hip::device_constants.WARP_SIZE;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register() : base_type(), m_value(0) {}
+
+
+  /*!
+   * @brief Copy constructor from raw value
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register(element_type c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &operator=(element_type c)
+  {
+    m_value = c;
+    return *this;
+  }
+
+  /*!
+   * @brief Gets our warp lane
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr static int get_lane() { return threadIdx.x; }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  constexpr element_type const &get_raw_value() const { return m_value; }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  element_type &get_raw_value() { return m_value; }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  static constexpr bool is_root() { return get_lane() == 0; }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &load_packed(element_type const *ptr)
+  {
+
+    auto lane = get_lane();
+
+    m_value = ptr[lane];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &load_packed_n(element_type const *ptr, int N)
+  {
+    auto lane = get_lane();
+    if (lane < N) {
+      m_value = ptr[lane];
+    } else {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &load_strided(element_type const *ptr, int stride)
+  {
+
+    auto lane = get_lane();
+
+    m_value = ptr[stride * lane];
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &load_strided_n(element_type const *ptr, int stride, int N)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<ELEMENT_TYPE, hip_wave_register>>;
-
-      using register_policy = hip_wave_register;
-      using self_type = Register<ELEMENT_TYPE, hip_wave_register>;
-      using element_type = ELEMENT_TYPE;
-      using register_type = ELEMENT_TYPE;
-
-      using int_vector_type = Register<int64_t, hip_wave_register>;
-
+    auto lane = get_lane();
+
+    if (lane < N) {
+      m_value = ptr[stride * lane];
+    } else {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &gather(element_type const *ptr, int_vector_type offsets)
+  {
+
+    m_value = ptr[offsets.get_raw_value()];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &gather_n(element_type const *ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
+    if (get_lane() < N) {
+      m_value = ptr[offsets.get_raw_value()];
+    } else {
+      m_value = element_type(0);
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and gather
+   * operations.
+   *
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type &segmented_load(element_type const *ptr,
+                            camp::idx_t segbits,
+                            camp::idx_t stride_inner,
+                            camp::idx_t stride_outer)
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    m_value = ptr[seg * stride_outer + i * stride_inner];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays where we load partial segments.
+   *
+   *
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type &segmented_load_nm(element_type const *ptr,
+                               camp::idx_t segbits,
+                               camp::idx_t stride_inner,
+                               camp::idx_t stride_outer,
+                               camp::idx_t num_inner,
+                               camp::idx_t num_outer)
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner) {
+      m_value = element_type(0);
+    } else {
+      m_value = ptr[seg * stride_outer + i * stride_inner];
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const &store_packed(element_type *ptr) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const &store_packed_n(element_type *ptr, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N) {
+      ptr[lane] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const &store_strided(element_type *ptr, int stride) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane * stride] = m_value;
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const &store_strided_n(element_type *ptr, int stride, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N) {
+      ptr[lane * stride] = m_value;
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const &scatter(element_type *ptr,
+                                                   T2 const &offsets) const
+  {
+
+    ptr[offsets.get_raw_value()] = m_value;
+
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const &scatter_n(element_type *ptr,
+                                                     T2 const &offsets,
+                                                     camp::idx_t N) const
+  {
+    if (get_lane() < N) {
+      ptr[offsets.get_raw_value()] = m_value;
+    }
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays.
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type const &segmented_store(element_type *ptr,
+                                   camp::idx_t segbits,
+                                   camp::idx_t stride_inner,
+                                   camp::idx_t stride_outer) const
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    ptr[seg * stride_outer + i * stride_inner] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays where we store partial segments.
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type const &segmented_store_nm(element_type *ptr,
+                                      camp::idx_t segbits,
+                                      camp::idx_t stride_inner,
+                                      camp::idx_t stride_outer,
+                                      camp::idx_t num_inner,
+                                      camp::idx_t num_outer) const
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
 
-		private:
-      element_type m_value;
+    if (seg >= num_outer || i >= num_inner) {
+      // nop
+    } else {
+      ptr[seg * stride_outer + i * stride_inner] = m_value;
+    }
 
-		public:
+    return *this;
+  }
 
-      static constexpr int s_num_elem = policy::hip::device_constants.WARP_SIZE;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register() : base_type(), m_value(0) {
-
-      }
-
-
-      /*!
-       * @brief Copy constructor from raw value
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(element_type c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-
-      /*!
-       * @brief Copy assignment operator
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(element_type c){
-        m_value = c;
-        return *this;
-      }
-
-      /*!
-       * @brief Gets our warp lane
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      static
-      int get_lane() {
-        return threadIdx.x;
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type const &get_raw_value() const {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      element_type &get_raw_value() {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return get_lane() == 0;
-      }
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed(element_type const *ptr){
-
-        auto lane = get_lane();
-
-        m_value = ptr[lane];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed_n(element_type const *ptr, int N){
-        auto lane = get_lane();
-        if(lane < N){
-          m_value = ptr[lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided(element_type const *ptr, int stride){
-
-        auto lane = get_lane();
-
-        m_value = ptr[stride*lane];
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided_n(element_type const *ptr, int stride, int N){
-        auto lane = get_lane();
-
-        if(lane < N){
-          m_value = ptr[stride*lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-
-        m_value = ptr[offsets.get_raw_value()];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        if(get_lane() < N){
-          m_value = ptr[offsets.get_raw_value()];
-        }
-        else{
-          m_value = element_type(0);
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load(element_type const *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer){
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        m_value = ptr[seg*stride_outer + i*stride_inner];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load_nm(element_type const *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer)
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          m_value = element_type(0);
-        }
-        else{
-          m_value = ptr[seg*stride_outer + i*stride_inner];
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed(element_type *ptr) const{
-
-        auto lane = get_lane();
-
-        ptr[lane] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed_n(element_type *ptr, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane] = m_value;
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided(element_type *ptr, int stride) const{
-
-        auto lane = get_lane();
-
-        ptr[lane*stride] = m_value;
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided_n(element_type *ptr, int stride, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane*stride] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, T2 const &offsets) const {
-
-        ptr[offsets.get_raw_value()] = m_value;
-
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, T2 const &offsets, camp::idx_t N) const {
-        if(get_lane() < N){
-          ptr[offsets.get_raw_value()] = m_value;
-        }
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store(element_type *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer) const {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        ptr[seg*stride_outer + i*stride_inner] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays where we store partial segments.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store_nm(element_type *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          ptr[seg*stride_outer + i*stride_inner] = m_value;
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      constexpr
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type get(int i) const
-			{
-        return hip::impl::shfl_sync(m_value, i);
-			}
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &set(element_type value, int i)
-			{
-				auto lane = get_lane();
-      	if(lane == i){
-					m_value = value;
-				}
-        return *this;
-			}
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &a){
-        m_value = a;
-        return *this;
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.m_value = hip::impl::shfl_sync(m_value, i, 32);
-        return x;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(m_value + b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(m_value - b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(m_value * b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(m_value / b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, int N) const {
-        return get_lane() < N ? self_type(m_value / b.m_value) : self_type(element_type(0));
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMA
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  constexpr RAJA_INLINE RAJA_DEVICE element_type get(int i) const
+  {
+    return hip::impl::shfl_sync(m_value, i);
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type &set(element_type value, int i)
+  {
+    auto lane = get_lane();
+    if (lane == i) {
+      m_value = value;
+    }
+    return *this;
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &a)
+  {
+    m_value = a;
+    return *this;
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.m_value = hip::impl::shfl_sync(m_value, i, 32);
+    return x;
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(m_value + b.m_value);
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(m_value - b.m_value);
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(m_value * b.m_value);
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    return self_type(m_value / b.m_value);
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const &b, int N) const
+  {
+    return get_lane() < N ? self_type(m_value / b.m_value)
+                          : self_type(element_type(0));
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMA
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
+                              RETURN_TYPE>::type
       multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+  {
+    return self_type(fma(m_value, b.m_value, c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
+                              RETURN_TYPE>::type
       multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value + c.m_value);
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMS
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+  {
+    return self_type(m_value * b.m_value + c.m_value);
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMS
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
+                              RETURN_TYPE>::type
       multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, -c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+  {
+    return self_type(fma(m_value, b.m_value, -c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
+                              RETURN_TYPE>::type
       multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value - c.m_value);
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type sum() const
-      {
-				// Allreduce sum
-				using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
-
-				return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max() const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max_n(int N) const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        auto ident = RAJA::operators::limits<element_type>::min();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmax(self_type a) const
-      {
-        return self_type{RAJA::max<element_type>(m_value, a.m_value)};
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min() const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
-
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min_n(int N) const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        auto ident = RAJA::operators::limits<element_type>::max();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmin(self_type a) const
-      {
-        return self_type{RAJA::min<element_type>(m_value, a.m_value)};
-      }
-
-
-
-
-      /*!
-       * Provides gather/scatter indices for segmented loads and stores
-       *
-       * THe number of segment bits (segbits) is specified, as well as the
-       * stride between elements in a segment (stride_inner),
-       * and the stride between segments (stride_outer)
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      static
-      int_vector_type s_segmented_offsets(camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer)
-      {
-        int_vector_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        result.get_raw_value() = seg*stride_outer + i*stride_inner;
-
-        return result;
-      }
-
-
-      /*!
-       * Sum elements within each segment, with segment size defined by segbits.
-       * Stores each segments sum consecutively, but shifed to the
-       * corresponding output_segment slot.
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums neighboring pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_inner(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int delta = 1;delta < 1<<segbits;delta = delta<<1){
-
-          // tree shuffle
-          element_type y = hip::impl::shfl_sync(x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        result.get_raw_value() = hip::impl::shfl_sync(x, get_lane()<<segbits);
-
-        // Third: mask off everything but output_segment
-        //        this is because all output segments are valid at this point
-        static constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE);
-        int our_output_segment = get_lane()>>(log2_warp_size-segbits);
-        bool in_output_segment = our_output_segment == output_segment;
-        if(!in_output_segment){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      /*!
-       * Sum across segments, with segment size defined by segbits
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums strided pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_outer(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        static constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE);
-        RAJA_UNROLL
-        for(int i = 0;i < log2_warp_size-segbits; ++ i){
-
-          // tree shuffle
-          int delta = s_num_elem >> (i+1);
-          element_type y = hip::impl::shfl_sync(x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        int get_from = get_lane()&( (1<<segbits)-1);
-        result.get_raw_value() = hip::impl::shfl_sync(x, get_from);
-
-        int mask = (get_lane()>>segbits) == output_segment;
-
-
-        // Third: mask off everything but output_segment
-        if(!mask){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_divide_nm(self_type den, camp::idx_t segbits, camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        self_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          result.get_raw_value() = m_value / den.get_raw_value();
-        }
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast copies a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x0, x1, x0, x1, x0, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x3, x2, x3, x2, x3, x2, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x7, x6, x7, x6, x7, x6, x7
-       *
-       *  and so on up to segbits=2, the input segments are 4 wide:
-       *
-       *      input segments allowed are from 0 or 1
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x2, x3, x0, x1, x2, x3
-       *
-       *      output_segment=1:
-       *      Result= x4, x5, x6, x7, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_inner(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t mask = (1<<segbits)-1;
-        camp::idx_t offset = input_segment << segbits;
-
-
-        camp::idx_t i = (get_lane()&mask) + offset;
-
-        result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
-
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast spreads a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x0, x0, x0, x1, x1, x1, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x2, x2, x2, x3, x3, x3, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x6, x6, x6, x7, x7, x7, x7
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_outer(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
-
-        camp::idx_t i = (get_lane() >> segbits) + offset;
-
-        result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
-
-        return result;
-      }
-
-
-
-
-  };
-
-
-
-}   // namespace expt
-
-} // namespace RAJA
-
-
-#endif // Guard
-
-#endif // HIP
+  {
+    return self_type(m_value * b.m_value - c.m_value);
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type sum() const
+  {
+    // Allreduce sum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
+
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type max() const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the largest element from first N lanes
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type max_n(int N) const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    auto ident = RAJA::operators::limits<element_type>::min();
+    auto lane = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type vmax(self_type a) const
+  {
+    return self_type{RAJA::max<element_type>(m_value, a.m_value)};
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type min() const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type min_n(int N) const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    auto ident = RAJA::operators::limits<element_type>::max();
+    auto lane = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type vmin(self_type a) const
+  {
+    return self_type{RAJA::min<element_type>(m_value, a.m_value)};
+  }
+
+
+  /*!
+   * Provides gather/scatter indices for segmented loads and stores
+   *
+   * The number of segment bits (segbits) is specified, as well as the
+   * stride between elements in a segment (stride_inner),
+   * and the stride between segments (stride_outer)
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  static int_vector_type s_segmented_offsets(camp::idx_t segbits,
+                                             camp::idx_t stride_inner,
+                                             camp::idx_t stride_outer)
+  {
+    int_vector_type result;
+
+    auto lane = get_lane();
+
+    // compute segment index and index within the segment
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    result.get_raw_value() = seg * stride_outer + i * stride_inner;
+
+    return result;
+  }
+
+
+  /*!
+   * Sum elements within each segment, with segment size defined by segbits.
+   * Stores each segment's sum consecutively, but shifted to the
+   * corresponding output_segment slot.
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums neighboring pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_sum_inner(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values within each segment
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int delta = 1; delta < 1 << segbits; delta = delta << 1) {
+
+      // tree shuffle
+      element_type y = hip::impl::shfl_sync(x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    result.get_raw_value() = hip::impl::shfl_sync(x, get_lane() << segbits);
+
+    // Third: mask off everything but output_segment
+    //        this is because all output segments are valid at this point
+    static constexpr int log2_warp_size =
+        RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE);
+    int our_output_segment = get_lane() >> (log2_warp_size - segbits);
+    bool in_output_segment = our_output_segment == output_segment;
+    if (!in_output_segment) {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  /*!
+   * Sum across segments, with segment size defined by segbits
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums strided pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_sum_outer(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values across segments
+    element_type x = m_value;
+    static constexpr int log2_warp_size =
+        RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE);
+    RAJA_UNROLL
+    for (int i = 0; i < log2_warp_size - segbits; ++i) {
+
+      // tree shuffle
+      int delta = s_num_elem >> (i + 1);
+      element_type y = hip::impl::shfl_sync(x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    int get_from = get_lane() & ((1 << segbits) - 1);
+    result.get_raw_value() = hip::impl::shfl_sync(x, get_from);
+
+    int mask = (get_lane() >> segbits) == output_segment;
+
+
+    // Third: mask off everything but output_segment
+    if (!mask) {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_divide_nm(self_type den,
+                                camp::idx_t segbits,
+                                camp::idx_t num_inner,
+                                camp::idx_t num_outer) const
+  {
+    self_type result;
+
+    auto lane = get_lane();
+
+    // compute segment index and index within the segment
+    auto seg = lane >> segbits;
+    auto i = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner) {
+      // nop
+    } else {
+      result.get_raw_value() = m_value / den.get_raw_value();
+    }
+
+    return result;
+  }
+
+
+  /*!
+   * Segmented broadcast copies a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x0, x1, x0, x1, x0, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x3, x2, x3, x2, x3, x2, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x7, x6, x7, x6, x7, x6, x7
+   *
+   *  and so on up to segbits=2, the input segments are 4 wide:
+   *
+   *      input segments allowed are 0 or 1
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x2, x3, x0, x1, x2, x3
+   *
+   *      input_segment=1:
+   *      Result= x4, x5, x6, x7, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_broadcast_inner(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t mask = (1 << segbits) - 1;
+    camp::idx_t offset = input_segment << segbits;
+
+
+    camp::idx_t i = (get_lane() & mask) + offset;
+
+    result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
+
+
+    return result;
+  }
+
+
+  /*!
+   * Segmented broadcast spreads a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x0, x0, x0, x1, x1, x1, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x2, x2, x2, x3, x3, x3, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x6, x6, x6, x7, x7, x7, x7
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_broadcast_outer(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
+
+    camp::idx_t i = (get_lane() >> segbits) + offset;
+
+    result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
+
+    return result;
+  }
+};
+
+
+}  // namespace expt
+
+}  // namespace RAJA
+
+
+#endif  // Guard
+
+#endif  // HIP
diff --git a/include/RAJA/policy/tensor/arch/hip/traits.hpp b/include/RAJA/policy/tensor/arch/hip/traits.hpp
index 1b8a9679bb..4cce58ee40 100644
--- a/include/RAJA/policy/tensor/arch/hip/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/hip/traits.hpp
@@ -21,27 +21,30 @@
 #ifndef RAJA_policy_tensor_arch_hip_traits_HPP
 #define RAJA_policy_tensor_arch_hip_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
-  template<typename T>
-  struct RegisterTraits<RAJA::expt::hip_wave_register, T>{
-      using element_type = T;
-      using register_policy = RAJA::expt::hip_wave_register;
-      static constexpr camp::idx_t s_num_elem = policy::hip::device_constants.WARP_SIZE;
+template <typename T>
+struct RegisterTraits<RAJA::expt::hip_wave_register, T> {
+  using element_type = T;
+  using register_policy = RAJA::expt::hip_wave_register;
+  static constexpr camp::idx_t s_num_elem =
+      policy::hip::device_constants.WARP_SIZE;
 
-      static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
-      using int_element_type = int32_t;
-  };
-
-} // namespace internal
-} // namespace expt
-} // namespace RAJA
+  static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
+  using int_element_type = int32_t;
+};
 
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_HIP
+#endif  // RAJA_ENABLE_HIP
diff --git a/include/RAJA/policy/tensor/arch/scalar.hpp b/include/RAJA/policy/tensor/arch/scalar.hpp
index 5e139f41f0..d81ab7c4af 100644
--- a/include/RAJA/policy/tensor/arch/scalar.hpp
+++ b/include/RAJA/policy/tensor/arch/scalar.hpp
@@ -16,16 +16,12 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 
-
 #ifndef RAJA_policy_tensor_arch_scalar_HPP
 #define RAJA_policy_tensor_arch_scalar_HPP
 
 
-
-#include<RAJA/policy/tensor/arch/scalar/traits.hpp>
-#include<RAJA/policy/tensor/arch/scalar/scalar.hpp>
+#include <RAJA/policy/tensor/arch/scalar/scalar.hpp>
+#include <RAJA/policy/tensor/arch/scalar/traits.hpp>
 
 
 #endif
-
-
diff --git a/include/RAJA/policy/tensor/arch/scalar/scalar.hpp b/include/RAJA/policy/tensor/arch/scalar/scalar.hpp
index 139c5d27a5..f1252307ab 100644
--- a/include/RAJA/policy/tensor/arch/scalar/scalar.hpp
+++ b/include/RAJA/policy/tensor/arch/scalar/scalar.hpp
@@ -22,449 +22,455 @@
 
 namespace RAJA
 {
-namespace expt {
+namespace expt
+{
+
+/**
+ * A specialization for a single element register.
+ * We will implement this as a scalar value, and let the compiler use
+ * whatever registers it deems appropriate.
+ */
+template <typename T>
+class Register<T, scalar_register>
+    : public internal::expt::RegisterBase<Register<T, scalar_register>>
+{
+public:
+  using base_type = internal::expt::RegisterBase<Register<T, scalar_register>>;
+
+  using register_policy = scalar_register;
+  using self_type = Register<T, scalar_register>;
+  using element_type = T;
+  using register_type = T;
+
+  using int_vector_type =
+      Register<typename internal::expt::RegisterTraits<scalar_register,
+                                                       T>::int_element_type,
+               scalar_register>;
+
+
+private:
+  T m_value;
+
+public:
+  static constexpr camp::idx_t s_num_elem = 1;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr Register() : base_type(), m_value(0) {}
+
+  /*!
+   * @brief Construct from a single scalar element value
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr Register(element_type const &c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr Register(self_type const &c) : base_type(), m_value(c.m_value) {}
+
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &operator=(self_type const &c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_packed(element_type const *ptr)
+  {
+    m_value = ptr[0];
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_packed_n(element_type const *ptr, camp::idx_t N)
+  {
+    if (N > 0) {
+      m_value = ptr[0];
+    } else {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_strided(element_type const *ptr, camp::idx_t)
+  {
+    m_value = ptr[0];
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &load_strided_n(element_type const *ptr, camp::idx_t, camp::idx_t N)
+  {
+    if (N > 0) {
+      m_value = ptr[0];
+    } else {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type &gather(element_type const *ptr, int_vector_type offsets)
+  {
+
+    m_value = ptr[offsets.get(0)];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type &gather_n(element_type const *ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
+    if (N > 0) {
+      m_value = ptr[offsets.get(0)];
+    } else {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_packed(element_type *ptr) const
+  {
+    ptr[0] = m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const
+  {
+    if (N > 0) {
+      ptr[0] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_strided(element_type *ptr, camp::idx_t) const
+  {
+    ptr[0] = m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const &store_strided_n(element_type *ptr,
+                                   camp::idx_t,
+                                   camp::idx_t N) const
+  {
+    if (N > 0) {
+      ptr[0] = m_value;
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type const &scatter(element_type *ptr, int_vector_type offsets) const
+  {
+
+    ptr[offsets.get(0)] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type const &scatter_n(element_type *ptr,
+                             int_vector_type offsets,
+                             camp::idx_t N) const
+  {
+    if (N > 0) {
+      ptr[offsets.get(0)] = m_value;
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  constexpr RAJA_INLINE RAJA_HOST_DEVICE element_type get(camp::idx_t) const
+  {
+    return m_value;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type &set(element_type value, camp::idx_t)
+  {
+    m_value = value;
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &broadcast(element_type const &a)
+  {
+    m_value = a;
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type &copy(self_type const &src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const &b) const
+  {
+    return self_type(m_value + b.m_value);
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const &b) const
+  {
+    return self_type(m_value - b.m_value);
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const &b) const
+  {
+    return self_type(m_value * b.m_value);
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const &b) const
+  {
+    return self_type(m_value / b.m_value);
+  }
+
+  /*!
+   * @brief Fused multiply add: fma(b, c) = (*this)*b+c
+   *
+   * Derived types can override this to implement intrinsic FMA's
+   *
+   * @param b Second product operand
+   * @param c Sum operand
+   * @return Value of (*this)*b+c
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const &b, self_type const &c) const
+  {
+    return m_value * b.m_value + c.m_value;
+  }
+
+  /*!
+   * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
+   *
+   * Derived types can override this to implement intrinsic FMS's
+   *
+   * @param b Second product operand
+   * @param c Subtraction operand
+   * @return Value of (*this)*b-c
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const &b, self_type const &c) const
+  {
+    return m_value * b.m_value - c.m_value;
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr element_type sum() const { return m_value; }
+
+
+  /*!
+   * @brief Dot product of this register with b
+   * @return Product of this register's scalar value and b's scalar value
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr element_type dot(self_type const &b) const
+  {
+    return m_value * b.m_value;
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr element_type max() const { return m_value; }
+
+  /*!
+   * @brief Returns the largest element from first N lanes
+   * @return The largest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return N ? m_value : RAJA::operators::limits<element_type>::min();
+    ;
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(RAJA::max<element_type>(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type min() const { return m_value; }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return The smallest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return N ? m_value : RAJA::operators::limits<element_type>::max();
+    ;
+  }
 
-  /**
-   * A specialization for a single element register.
-   * We will implement this as a scalar value, and let the compiler use
-   * whatever registers it deems appropriate.
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
    */
-  template<typename T>
-  class Register<T, scalar_register> :
-      public internal::expt::RegisterBase<Register<T, scalar_register>>
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type vmin(self_type a) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<T, scalar_register>>;
-
-      using register_policy = scalar_register;
-      using self_type = Register<T, scalar_register>;
-      using element_type = T;
-      using register_type = T;
-
-      using int_vector_type = Register<typename internal::expt::RegisterTraits<scalar_register, T>::int_element_type, scalar_register>;
-
-
-    private:
-      T m_value;
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 1;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      Register() : base_type(), m_value(0) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      Register(element_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = ptr[0];
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        if(N > 0){
-          m_value = ptr[0];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t ){
-        m_value = ptr[0];
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t , camp::idx_t N){
-        if(N > 0){
-          m_value = ptr[0];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-
-        m_value = ptr[offsets.get(0)];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        if(N > 0){
-          m_value = ptr[offsets.get(0)];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        ptr[0] = m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        if(N > 0){
-          ptr[0] = m_value;
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t ) const{
-        ptr[0] = m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t , camp::idx_t N) const{
-        if(N > 0){
-          ptr[0] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, int_vector_type offsets) const {
-
-        ptr[offsets.get(0)] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, int_vector_type offsets, camp::idx_t N) const {
-        if(N > 0){
-          ptr[offsets.get(0)] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      constexpr
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type get(camp::idx_t) const
-      {return m_value;}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &set(element_type value, camp::idx_t)
-      {
-        m_value = value;
-        return *this;
-      }
-
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &a){
-        m_value = a;
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(m_value + b.m_value);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(m_value - b.m_value);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(m_value * b.m_value);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(m_value / b.m_value);
-      }
-
-      /*!
-       * @brief Fused multiply add: fma(b, c) = (*this)*b+c
-       *
-       * Derived types can override this to implement intrinsic FMA's
-       *
-       * @param b Second product operand
-       * @param c Sum operand
-       * @return Value of (*this)*b+c
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return m_value * b.m_value + c.m_value;
-      }
-
-      /*!
-       * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
-       *
-       * Derived types can override this to implement intrinsic FMS's
-       *
-       * @param b Second product operand
-       * @param c Subtraction operand
-       * @return Value of (*this)*b-c
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return m_value * b.m_value - c.m_value;
-      }
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type sum() const
-      {
-        return m_value;
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type dot(self_type const &b) const
-      {
-        return m_value * b.m_value;
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type max() const
-      {
-        return m_value;
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        return N ? m_value : RAJA::operators::limits<element_type>::min();;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(RAJA::max<element_type>(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the smallest element
-       * @return The smallest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type min() const
-      {
-        return m_value;
-      }
-
-      /*!
-       * @brief Returns the smallest element from first N lanes
-       * @return The smallest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        return N ? m_value : RAJA::operators::limits<element_type>::max();;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(RAJA::min<element_type>(m_value, a.m_value));
-      }
-
-
-
-  };
-} // namespace expt
+    return self_type(RAJA::min<element_type>(m_value, a.m_value));
+  }
+};
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/policy/tensor/arch/scalar/traits.hpp b/include/RAJA/policy/tensor/arch/scalar/traits.hpp
index dfeccbb86f..8080edc31a 100644
--- a/include/RAJA/policy/tensor/arch/scalar/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/scalar/traits.hpp
@@ -19,52 +19,53 @@
 #ifndef RAJA_policy_tensor_arch_scalar_traits_HPP
 #define RAJA_policy_tensor_arch_scalar_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::scalar_register, int32_t> {
+  using element_type = int32_t;
+  using register_policy = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::scalar_register, int64_t> {
+  using element_type = int64_t;
+  using register_policy = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type = int64_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::scalar_register, float> {
+  using element_type = float;
+  using register_policy = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::scalar_register, double> {
+  using element_type = double;
+  using register_policy = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type = int64_t;
+};
 
 
-}
-}
-}
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 #endif
-
-
diff --git a/include/RAJA/policy/tensor/arch_impl.hpp b/include/RAJA/policy/tensor/arch_impl.hpp
index e14451505a..0e7085b5e2 100644
--- a/include/RAJA/policy/tensor/arch_impl.hpp
+++ b/include/RAJA/policy/tensor/arch_impl.hpp
@@ -22,7 +22,6 @@
 #include "RAJA/policy/tensor/arch.hpp"
 
 
-
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -32,30 +31,29 @@
 //
 
 #ifdef __AVX512F__
-#include<RAJA/policy/tensor/arch/avx512.hpp>
+#include <RAJA/policy/tensor/arch/avx512.hpp>
 #endif
 
 
 #ifdef __AVX2__
-#include<RAJA/policy/tensor/arch/avx2.hpp>
+#include <RAJA/policy/tensor/arch/avx2.hpp>
 #endif
 
 
 #ifdef __AVX__
-#include<RAJA/policy/tensor/arch/avx.hpp>
+#include <RAJA/policy/tensor/arch/avx.hpp>
 #endif
 
 #ifdef RAJA_CUDA_ACTIVE
-#include<RAJA/policy/tensor/arch/cuda.hpp>
+#include <RAJA/policy/tensor/arch/cuda.hpp>
 #endif
 
 #ifdef RAJA_HIP_ACTIVE
-#include<RAJA/policy/tensor/arch/hip.hpp>
+#include <RAJA/policy/tensor/arch/hip.hpp>
 #endif
 
 // The scalar register is always supported (doesn't require any SIMD/SIMT)
-#include<RAJA/policy/tensor/arch/scalar.hpp>
-
+#include <RAJA/policy/tensor/arch/scalar.hpp>
 
 
 #endif
diff --git a/include/RAJA/policy/tensor/policy.hpp b/include/RAJA/policy/tensor/policy.hpp
index 8618d543b2..a4d3ebeaf9 100644
--- a/include/RAJA/policy/tensor/policy.hpp
+++ b/include/RAJA/policy/tensor/policy.hpp
@@ -18,8 +18,8 @@
 #ifndef RAJA_policy_tensor_policy_HPP
 #define RAJA_policy_tensor_policy_HPP
 
-#include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/config.hpp"
+#include "RAJA/policy/PolicyBase.hpp"
 
 
 //
@@ -40,7 +40,10 @@ namespace policy
 namespace tensor
 {
 
-template<typename EXEC_POLICY, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t TILE_SIZE>
+template <typename EXEC_POLICY,
+          typename TENSOR_TYPE,
+          camp::idx_t DIM,
+          camp::idx_t TILE_SIZE>
 struct tensor_exec : public EXEC_POLICY {
   using exec_policy = EXEC_POLICY;
   using tensor_type = TENSOR_TYPE;
@@ -50,27 +53,28 @@ struct tensor_exec : public EXEC_POLICY {
 };
 
 
-
 }  // end of namespace tensor
 
 }  // end of namespace policy
 
-namespace expt {
-
-
-template<typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
-using vector_exec = policy::tensor::tensor_exec<RAJA::seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
+namespace expt
+{
 
-template<typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
-using matrix_row_exec = policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
 
-template<typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
-using matrix_col_exec = policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 1, TILE_SIZE>;
+template <typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
+using vector_exec =
+    policy::tensor::tensor_exec<RAJA::seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
 
+template <typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
+using matrix_row_exec =
+    policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
 
-} //  namespace expt
+template <typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
+using matrix_col_exec =
+    policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 1, TILE_SIZE>;
 
 
+}  //  namespace expt
 
 
 }  // end of namespace RAJA
diff --git a/include/RAJA/util/BitMask.hpp b/include/RAJA/util/BitMask.hpp
index 63f011b689..59e94e08a0 100644
--- a/include/RAJA/util/BitMask.hpp
+++ b/include/RAJA/util/BitMask.hpp
@@ -24,61 +24,59 @@
 namespace RAJA
 {
 
-  template<camp::idx_t N>
-  struct LogBase2
-  {
-      static constexpr camp::idx_t value = LogBase2<(N>>1)>::value + 1;
-      static constexpr bool is_exact = ((1<<value) == N);
-  };
-
-  template<>
-  struct LogBase2<0>
-  {
-      static constexpr camp::idx_t value = -1;
-      static constexpr bool is_exact = true;
-  };
+template <camp::idx_t N>
+struct LogBase2 {
+  static constexpr camp::idx_t value = LogBase2<(N >> 1)>::value + 1;
+  static constexpr bool is_exact = ((1 << value) == N);
+};
 
-  /*!
-   * A bit-masking operator
-   *
-   * Provides an operator that shifts and masks in input value to extract
-   * a contiguous set of bits.
-   *
-   * result = (input >> Shift) & (Mask)
-   *
-   * Where mask is (1<<Width)-1, or the number of bits defined by Width.
-   *
-   *
-   */
-  template<int Width, int Shift>
-  struct BitMask {
-    static constexpr int shift = Shift;
-    static constexpr int width = Width;
-    static constexpr int max_input_size = 1<<(Shift+Width);
-    static constexpr int max_masked_size = 1<<Width;
-    static constexpr int max_shifted_size = 1<<Shift;
+template <>
+struct LogBase2<0> {
+  static constexpr camp::idx_t value = -1;
+  static constexpr bool is_exact = true;
+};
 
-    template<typename T>
-    RAJA_HOST_DEVICE
-    static constexpr T maskValue(T input) {
-      return( (input>>( static_cast<T>(Shift) )) & static_cast<T>((1<<(Width))-1) );
-    }
+/*!
+ * A bit-masking operator
+ *
+ * Provides an operator that shifts and masks an input value to extract
+ * a contiguous set of bits.
+ *
+ * result = (input >> Shift) & (Mask)
+ *
+ * Where mask is (1<<Width)-1, or the number of bits defined by Width.
+ *
+ *
+ */
+template <int Width, int Shift>
+struct BitMask {
+  static constexpr int shift = Shift;
+  static constexpr int width = Width;
+  static constexpr int max_input_size = 1 << (Shift + Width);
+  static constexpr int max_masked_size = 1 << Width;
+  static constexpr int max_shifted_size = 1 << Shift;
 
+  template <typename T>
+  RAJA_HOST_DEVICE static constexpr T maskValue(T input)
+  {
+    return ((input >> (static_cast<T>(Shift))) &
+            static_cast<T>((1 << (Width)) - 1));
+  }
 
-    template<typename T>
-    RAJA_HOST_DEVICE
-    static constexpr T getOuter(T input) {
-      return(  (input>>(static_cast<T>(Shift))) >> Width );
-    }
 
-    template<typename T>
-    RAJA_HOST_DEVICE
-    static constexpr T maskOuter(T input) {
-      return( input & (static_cast<T>(-1) << (Width+Shift) )  );
-    }
+  template <typename T>
+  RAJA_HOST_DEVICE static constexpr T getOuter(T input)
+  {
+    return ((input >> (static_cast<T>(Shift))) >> Width);
+  }
 
-  };
+  template <typename T>
+  RAJA_HOST_DEVICE static constexpr T maskOuter(T input)
+  {
+    return (input & (static_cast<T>(-1) << (Width + Shift)));
+  }
+};
 
 }  // namespace RAJA
 
-#endif //RAJA_util_BitMask_HPP
+#endif  // RAJA_util_BitMask_HPP
diff --git a/include/RAJA/util/CombiningAdapter.hpp b/include/RAJA/util/CombiningAdapter.hpp
index abe8197b93..faebe59d8f 100644
--- a/include/RAJA/util/CombiningAdapter.hpp
+++ b/include/RAJA/util/CombiningAdapter.hpp
@@ -21,11 +21,11 @@
 #include <type_traits>
 
 #include "RAJA/index/RangeSegment.hpp"
+#include "RAJA/util/Layout.hpp"
+#include "RAJA/util/OffsetLayout.hpp"
 #include "RAJA/util/camp_aliases.hpp"
 #include "RAJA/util/concepts.hpp"
 #include "RAJA/util/macros.hpp"
-#include "RAJA/util/Layout.hpp"
-#include "RAJA/util/OffsetLayout.hpp"
 
 namespace RAJA
 {
@@ -78,8 +78,7 @@ namespace RAJA
  *
  */
 template <typename Lambda, typename Layout_>
-struct CombiningAdapter
-{
+struct CombiningAdapter {
   using Layout = Layout_;
 
   using IndexRange = typename Layout::IndexRange;
@@ -95,10 +94,11 @@ struct CombiningAdapter
   Layout m_layout;
 
   RAJA_SUPPRESS_HD_WARN
-  template < camp::idx_t... RangeInts >
+  template <camp::idx_t... RangeInts>
   RAJA_HOST_DEVICE inline auto call_helper(IndexLinear linear_index,
                                            camp::idx_seq<RangeInts...>)
-    -> decltype(m_lambda(camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
+      -> decltype(m_lambda(
+          camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
   {
     DimTuple indices;
     m_layout.toIndices(linear_index, camp::get<RangeInts>(indices)...);
@@ -106,10 +106,11 @@ struct CombiningAdapter
   }
   ///
   RAJA_SUPPRESS_HD_WARN
-  template < camp::idx_t... RangeInts >
+  template <camp::idx_t... RangeInts>
   RAJA_HOST_DEVICE inline auto call_helper(IndexLinear linear_index,
                                            camp::idx_seq<RangeInts...>) const
-    -> decltype(m_lambda(camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
+      -> decltype(m_lambda(
+          camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
   {
     DimTuple indices;
     m_layout.toIndices(linear_index, camp::get<RangeInts>(indices)...);
@@ -117,14 +118,13 @@ struct CombiningAdapter
   }
 
 public:
-
   /*!
    * Constructor from lambda and layout.
    */
-  template < typename C_Lambda, typename C_Layout >
+  template <typename C_Lambda, typename C_Layout>
   RAJA_HOST_DEVICE CombiningAdapter(C_Lambda&& lambda, C_Layout&& layout)
-      : m_lambda(std::forward<C_Lambda>(lambda))
-      , m_layout(std::forward<C_Layout>(layout))
+      : m_lambda(std::forward<C_Lambda>(lambda)),
+        m_layout(std::forward<C_Layout>(layout))
   {
   }
 
@@ -134,13 +134,13 @@ struct CombiningAdapter
    * @return return value of lambda
    */
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(IndexLinear linear_index)
-    -> decltype(call_helper(linear_index, IndexRange()))
+      -> decltype(call_helper(linear_index, IndexRange()))
   {
     return call_helper(linear_index, IndexRange());
   }
   ///
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(IndexLinear linear_index) const
-    -> decltype(call_helper(linear_index, IndexRange()))
+      -> decltype(call_helper(linear_index, IndexRange()))
   {
     return call_helper(linear_index, IndexRange());
   }
@@ -207,9 +207,10 @@ struct CombiningAdapter
  *
  */
 template <typename Lambda, typename Layout>
-RAJA_HOST_DEVICE RAJA_INLINE
-auto make_CombiningAdapter_from_layout(Lambda&& lambda, Layout&& layout)
-  // -> CombiningAdapter<camp::decay<Lambda>, camp::decay<Layout>>
+RAJA_HOST_DEVICE RAJA_INLINE auto make_CombiningAdapter_from_layout(
+    Lambda&& lambda,
+    Layout&& layout)
+// -> CombiningAdapter<camp::decay<Lambda>, camp::decay<Layout>>
 {
   return CombiningAdapter<camp::decay<Lambda>, camp::decay<Layout>>(
       std::forward<Lambda>(lambda), std::forward<Layout>(layout));
@@ -217,48 +218,54 @@ auto make_CombiningAdapter_from_layout(Lambda&& lambda, Layout&& layout)
 ///
 RAJA_SUPPRESS_HD_WARN
 template <typename Lambda, typename... IdxTs>
-RAJA_INLINE
-auto make_CombiningAdapter(Lambda&& lambda, ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
-  // -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
-  //             camp::val<RAJA::TypedOffsetLayout<
-  //                 typename std::common_type< strip_index_type_t<IdxTs>... >::type,
-  //                 IdxTs...>>()))
+RAJA_INLINE auto make_CombiningAdapter(
+    Lambda&& lambda,
+    ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
+// -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
+//             camp::val<RAJA::TypedOffsetLayout<
+//                 typename std::common_type< strip_index_type_t<IdxTs>...
+//                 >::type, IdxTs...>>()))
 {
-  using std::begin; using std::end; using std::distance;
-  using IdxLin = typename std::common_type< strip_index_type_t<IdxTs>... >::type;
+  using std::begin;
+  using std::distance;
+  using std::end;
+  using IdxLin = typename std::common_type<strip_index_type_t<IdxTs>...>::type;
   using Layout = RAJA::Layout<sizeof...(IdxTs), IdxLin>;
   using OffsetLayout = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<IdxTs...>>;
 
   Layout layout(static_cast<IdxLin>(distance(begin(segs), end(segs)))...);
   OffsetLayout offset_layout = OffsetLayout::from_layout_and_offsets(
-        {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
-                                            : static_cast<IdxLin>(0))...}},
-        std::move(layout));
+      {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
+                                          : static_cast<IdxLin>(0))...}},
+      std::move(layout));
   return make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
                                            std::move(offset_layout));
 }
 ///
 RAJA_SUPPRESS_HD_WARN
 template <typename Perm, typename Lambda, typename... IdxTs>
-RAJA_INLINE
-auto make_PermutedCombiningAdapter(Lambda&& lambda, ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
-  // -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
-  //             camp::val<RAJA::TypedOffsetLayout<
-  //                 typename std::common_type< strip_index_type_t<IdxTs>... >::type,
-  //                 IdxTs...>>()))
+RAJA_INLINE auto make_PermutedCombiningAdapter(
+    Lambda&& lambda,
+    ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
+// -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
+//             camp::val<RAJA::TypedOffsetLayout<
+//                 typename std::common_type< strip_index_type_t<IdxTs>...
+//                 >::type, IdxTs...>>()))
 {
-  using std::begin; using std::end; using std::distance;
-  using IdxLin = typename std::common_type< strip_index_type_t<IdxTs>... >::type;
+  using std::begin;
+  using std::distance;
+  using std::end;
+  using IdxLin = typename std::common_type<strip_index_type_t<IdxTs>...>::type;
   using OffsetLayout = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<IdxTs...>>;
 
   auto layout = make_permuted_layout<sizeof...(IdxTs), IdxLin>(
-              {{static_cast<IdxLin>(distance(begin(segs), end(segs)))...}},
-              RAJA::as_array<Perm>::get());
+      {{static_cast<IdxLin>(distance(begin(segs), end(segs)))...}},
+      RAJA::as_array<Perm>::get());
   OffsetLayout offset_layout = OffsetLayout::from_layout_and_offsets(
-        {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
-                                            : static_cast<IdxLin>(0))...}},
+      {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
+                                          : static_cast<IdxLin>(0))...}},
 
-        std::move(layout));
+      std::move(layout));
   return make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
                                            std::move(offset_layout));
 }
diff --git a/include/RAJA/util/EnableIf.hpp b/include/RAJA/util/EnableIf.hpp
index 257e852bf9..3c0605c5ea 100644
--- a/include/RAJA/util/EnableIf.hpp
+++ b/include/RAJA/util/EnableIf.hpp
@@ -20,15 +20,13 @@
 #ifndef RAJA_util_EnableIf_HPP
 #define RAJA_util_EnableIf_HPP
 
-#include "RAJA/config.hpp"
-
 #include <type_traits>
 
+#include "RAJA/config.hpp"
+#include "RAJA/util/concepts.hpp"
 #include "camp/list.hpp"
 #include "camp/type_traits.hpp"
 
-#include "RAJA/util/concepts.hpp"
-
 
 namespace RAJA
 {
@@ -41,14 +39,16 @@ struct is_any_of;
 
 template <typename T, typename... Types>
 struct is_any_of<T, ::camp::list<Types...>>
-  : ::RAJA::concepts::any_of<::camp::is_same<T, Types>...>
-{};
+    : ::RAJA::concepts::any_of<::camp::is_same<T, Types>...> {
+};
 
 template <typename T, typename TypeList>
 using enable_if_is_any_of = std::enable_if_t<is_any_of<T, TypeList>::value, T>;
 
 template <typename T, typename TypeList>
-using enable_if_is_none_of = std::enable_if_t<::RAJA::concepts::negate<is_any_of<T, TypeList>>::value, T>;
+using enable_if_is_none_of =
+    std::enable_if_t<::RAJA::concepts::negate<is_any_of<T, TypeList>>::value,
+                     T>;
 
 
 }  // namespace util
diff --git a/include/RAJA/util/IndexLayout.hpp b/include/RAJA/util/IndexLayout.hpp
index 6bb308d375..bae5ca3b60 100644
--- a/include/RAJA/util/IndexLayout.hpp
+++ b/include/RAJA/util/IndexLayout.hpp
@@ -3,7 +3,8 @@
  *
  * \file
  *
- * \brief   RAJA header file defining the IndexLayout class and IndexList classes.
+ * \brief   RAJA header file defining the IndexLayout class and IndexList
+ *          classes.
  *
  ******************************************************************************
  */
@@ -20,52 +21,56 @@
 
 #include "RAJA/util/Layout.hpp"
 
-namespace RAJA 
+namespace RAJA
 {
 
 /*!
-* DirectIndex struct contains call operator that returns the same index that was input
-*
-*/
-template<typename IdxLin = Index_type>
+ * DirectIndex struct contains call operator that returns the same index that
+ * was input
+ *
+ */
+template <typename IdxLin = Index_type>
 struct DirectIndex {
 
-  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(const IdxLin idx) const
+  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(
+      const IdxLin idx) const
   {
     return idx;
   }
-
 };
 
 /*!
-* IndexList struct stores a pointer to an array containing the index list.
-* Its call operator returns the entry at the input location (idx) of its index list.
-* 
-*/
-template<typename IdxLin = Index_type>
+ * IndexList struct stores a pointer to an array containing the index list.
+ * Its call operator returns the entry at the input location (idx) of its index
+ * list.
+ *
+ */
+template <typename IdxLin = Index_type>
 struct IndexList {
 
   IdxLin* index_list{nullptr};
 
-  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(const IdxLin idx) const
+  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(
+      const IdxLin idx) const
   {
     return index_list[idx];
   }
-
 };
 
 /*!
-* ConditionalIndexList struct stores a pointer to an array containing the index list.
-* Its call operator returns the same index that was input if the index list is a nullptr, 
-* or otherwise returns the entry at the input location (idx) of its index list.
-* 
-*/
-template<typename IdxLin = Index_type>
+ * ConditionalIndexList struct stores a pointer to an array containing the index
+ * list. Its call operator returns the same index that was input if the index
+ * list is a nullptr, or otherwise returns the entry at the input location (idx)
+ * of its index list.
+ *
+ */
+template <typename IdxLin = Index_type>
 struct ConditionalIndexList {
 
-  IdxLin* index_list{nullptr};  
+  IdxLin* index_list{nullptr};
 
-  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(const IdxLin idx) const
+  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(
+      const IdxLin idx) const
   {
     if (index_list) {
       return index_list[idx];
@@ -73,13 +78,12 @@ struct ConditionalIndexList {
       return idx;
     }
   }
-
 };
 
 namespace internal
 {
 
-template<typename Range, typename IdxLin, typename... IndexTypes>
+template <typename Range, typename IdxLin, typename... IndexTypes>
 struct IndexLayout_impl;
 
 template <camp::idx_t... RangeInts, typename IdxLin, typename... IndexTypes>
@@ -97,73 +101,78 @@ struct IndexLayout_impl<camp::idx_seq<RangeInts...>, IdxLin, IndexTypes...> {
   constexpr RAJA_INLINE IndexLayout_impl(
       camp::tuple<IndexTypes...> index_tuple_in,
       Types... ns)
-      : base_{(ns)...},
-        tuple(index_tuple_in)
+      : base_{(ns)...}, tuple(index_tuple_in)
   {
   }
 
   /*!
    * Computes a linear space index from entries of index lists stored in tuple.
-   * This is accomplished through the inner product of the strides and the 
+   * This is accomplished through the inner product of the strides and the
    * entry in the index list along each dimension.
    * @param indices Indices in the n-dimensional space of this layout
    * @return Linear space index.
-   */  
+   */
   template <typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
       Indices... indices) const
   {
     return sum<IdxLin>(
-      (base_.strides[RangeInts] * camp::get<RangeInts>(tuple)(indices))...);
+        (base_.strides[RangeInts] * camp::get<RangeInts>(tuple)(indices))...);
   }
-
 };
 
-} // namespace internal
+}  // namespace internal
 
 
-template <size_t n_dims = 1, typename IdxLin = Index_type, typename... IndexTypes>
+template <size_t n_dims = 1,
+          typename IdxLin = Index_type,
+          typename... IndexTypes>
 struct IndexLayout
-    : public internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...> {
-  using Base =
-      internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>;
+    : public internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>,
+                                        IdxLin,
+                                        IndexTypes...> {
+  using Base = internal::
+      IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>;
 
   using internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>,
-                                    IdxLin, IndexTypes...>::IndexLayout_impl;
+                                   IdxLin,
+                                   IndexTypes...>::IndexLayout_impl;
 
-  constexpr RAJA_INLINE RAJA_HOST_DEVICE IndexLayout(
-      const internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>&
-          rhs)
+  constexpr RAJA_INLINE RAJA_HOST_DEVICE
+  IndexLayout(const internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>,
+                                               IdxLin,
+                                               IndexTypes...>& rhs)
       : Base{rhs}
   {
   }
-
 };
 
 /*!
- * creates of a camp::tuple of index types 
+ * creates a camp::tuple of index types
  * (such as DirectIndex, IndexList, or ConditionalIndexList)
  *
  */
 template <typename... IndexTypes>
 auto make_index_tuple(IndexTypes... it) -> camp::tuple<IndexTypes...>
 {
-    return camp::tuple<IndexTypes...>(it...);
+  return camp::tuple<IndexTypes...>(it...);
 }
 
 /*!
  * creates an index layout based on the input camp::tuple of index types
  *
- */  
-template <typename IdxLin = Index_type, typename... Types, typename... IndexTypes>
-auto make_index_layout(
-  camp::tuple<IndexTypes...> index_tuple_in,
-  Types... ns) -> IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>
+ */
+template <typename IdxLin = Index_type,
+          typename... Types,
+          typename... IndexTypes>
+auto make_index_layout(camp::tuple<IndexTypes...> index_tuple_in, Types... ns)
+    -> IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>
 {
-    static_assert(sizeof...(Types) == sizeof...(IndexTypes), "");
-    return IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>(index_tuple_in, ns...);
+  static_assert(sizeof...(Types) == sizeof...(IndexTypes), "");
+  return IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>(index_tuple_in,
+                                                              ns...);
 }
 
-}
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/KokkosPluginLoader.hpp b/include/RAJA/util/KokkosPluginLoader.hpp
index c5060a0a96..7812306b71 100644
--- a/include/RAJA/util/KokkosPluginLoader.hpp
+++ b/include/RAJA/util/KokkosPluginLoader.hpp
@@ -14,39 +14,44 @@
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/PluginStrategy.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
-  class KokkosPluginLoader : public ::RAJA::util::PluginStrategy
-  {
-  public:
-    using Parent = ::RAJA::util::PluginStrategy;
-    typedef void (*init_function)(const int, const uint64_t, const uint32_t, void*);
-    typedef void (*pre_function)(const char*, const uint32_t, uint64_t*);
-    typedef void (*post_function)(uint64_t);
-    typedef void (*finalize_function)();
+class KokkosPluginLoader : public ::RAJA::util::PluginStrategy
+{
+public:
+  using Parent = ::RAJA::util::PluginStrategy;
+  typedef void (*init_function)(const int,
+                                const uint64_t,
+                                const uint32_t,
+                                void*);
+  typedef void (*pre_function)(const char*, const uint32_t, uint64_t*);
+  typedef void (*post_function)(uint64_t);
+  typedef void (*finalize_function)();
 
-    KokkosPluginLoader();
+  KokkosPluginLoader();
 
-    void preLaunch(const RAJA::util::PluginContext& p) override;
+  void preLaunch(const RAJA::util::PluginContext& p) override;
 
-    void postLaunch(const RAJA::util::PluginContext& p) override;
+  void postLaunch(const RAJA::util::PluginContext& p) override;
 
-    void finalize() override;
+  void finalize() override;
 
-  private:
-    void initPlugin(const std::string &path);
-    
-    void initDirectory(const std::string &path);
+private:
+  void initPlugin(const std::string& path);
 
-    std::vector<init_function> init_functions;
-    std::vector<pre_function> pre_functions;
-    std::vector<post_function> post_functions;
-    std::vector<finalize_function> finalize_functions;
+  void initDirectory(const std::string& path);
 
-  };  // end KokkosPluginLoader class
+  std::vector<init_function> init_functions;
+  std::vector<pre_function> pre_functions;
+  std::vector<post_function> post_functions;
+  std::vector<finalize_function> finalize_functions;
 
-  void linkKokkosPluginLoader();
+};  // end KokkosPluginLoader class
+
+void linkKokkosPluginLoader();
 
 }  // end namespace util
 }  // end namespace RAJA
diff --git a/include/RAJA/util/Layout.hpp b/include/RAJA/util/Layout.hpp
index 948e37f498..d981964076 100644
--- a/include/RAJA/util/Layout.hpp
+++ b/include/RAJA/util/Layout.hpp
@@ -18,16 +18,13 @@
 #ifndef RAJA_LAYOUT_HPP
 #define RAJA_LAYOUT_HPP
 
-#include "RAJA/config.hpp"
-
 #include <cassert>
 #include <iostream>
 #include <limits>
 
+#include "RAJA/config.hpp"
 #include "RAJA/index/IndexValue.hpp"
-
 #include "RAJA/internal/foldl.hpp"
-
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/Permutations.hpp"
 
@@ -38,7 +35,6 @@ namespace detail
 {
 
 
-
 template <typename Range,
           typename IdxLin = Index_type,
           ptrdiff_t StrideOneDim = -1>
@@ -90,10 +86,8 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   constexpr RAJA_INLINE LayoutBase_impl() = default;
   constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl const &) = default;
   constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl &&) = default;
-  RAJA_INLINE LayoutBase_impl &operator=(LayoutBase_impl const &) =
-      default;
-  RAJA_INLINE LayoutBase_impl &operator=(LayoutBase_impl &&) =
-      default;
+  RAJA_INLINE LayoutBase_impl &operator=(LayoutBase_impl const &) = default;
+  RAJA_INLINE LayoutBase_impl &operator=(LayoutBase_impl &&) = default;
 
   /*!
    * Construct a layout given the size of each dimension.
@@ -143,11 +137,13 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   /*!
    * Methods to performs bounds checking in layout objects
    */
-  template<camp::idx_t N, typename Idx>
+  template <camp::idx_t N, typename Idx>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheckError(Idx idx) const
   {
     printf("Error at index %d, value %ld is not within bounds [0, %ld] \n",
-           static_cast<int>(N), static_cast<long int>(idx), static_cast<long int>(sizes[N] - 1));
+           static_cast<int>(N),
+           static_cast<long int>(idx),
+           static_cast<long int>(sizes[N] - 1));
     RAJA_ABORT_OR_THROW("Out of bounds error \n");
   }
 
@@ -160,8 +156,7 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck(Idx idx,
                                                 Indices... indices) const
   {
-    if(sizes[N] > 0 && !(0<=idx && idx < static_cast<Idx>(sizes[N])))
-    {
+    if (sizes[N] > 0 && !(0 <= idx && idx < static_cast<Idx>(sizes[N]))) {
       BoundsCheckError<N>(idx);
     }
     RAJA_UNUSED_VAR(idx);
@@ -180,16 +175,16 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin
   operator()(Indices... indices) const
   {
-#if defined (RAJA_BOUNDS_CHECK_INTERNAL)
+#if defined(RAJA_BOUNDS_CHECK_INTERNAL)
     BoundsCheck<0>(indices...);
 #endif
     // dot product of strides and indices
-    return sum<IdxLin>(
-      (RangeInts==stride_one_dim ?   // Is this dimension stride-one?
-         indices :  // it's stride one, so dont bother with multiply
-         strides[RangeInts]*indices // it's not stride one
-			)...
-    );
+    return sum<IdxLin>((RangeInts == stride_one_dim
+                            ?  // Is this dimension stride-one?
+                            indices
+                            :  // it's stride one, so don't bother with multiply
+                            strides[RangeInts] * indices  // it's not stride one
+                        )...);
   }
 
 
@@ -205,20 +200,21 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
    */
   template <typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              Indices &&... indices) const
+                                              Indices &&...indices) const
   {
 #if defined(RAJA_BOUNDS_CHECK_INTERNAL)
     IdxLin totSize = size_noproj();
-    if(totSize > 0 && (linear_index < 0 || linear_index >= totSize)) {
+    if (totSize > 0 && (linear_index < 0 || linear_index >= totSize)) {
       printf("Error! Linear index %ld is not within bounds [0, %ld]. \n",
-             static_cast<long int>(linear_index), static_cast<long int>(totSize-1));
+             static_cast<long int>(linear_index),
+             static_cast<long int>(totSize - 1));
       RAJA_ABORT_OR_THROW("Out of bounds error \n");
-     }
+    }
 #endif
 
-    camp::sink((indices =
-      (camp::decay<Indices>)((linear_index / inv_strides[RangeInts]) %
-                             inv_mods[RangeInts]))...);
+    camp::sink((indices = (camp::decay<Indices>)((linear_index /
+                                                  inv_strides[RangeInts]) %
+                                                 inv_mods[RangeInts]))...);
   }
 
   /*!
@@ -232,7 +228,8 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
     // Multiply together all of the sizes,
     // replacing 1 for any zero-sized dimensions
     return foldl(RAJA::operators::multiplies<IdxLin>(),
-                         (sizes[RangeInts] == IdxLin(0) ? IdxLin(1) : sizes[RangeInts])...);
+                 (sizes[RangeInts] == IdxLin(0) ? IdxLin(1)
+                                                : sizes[RangeInts])...);
   }
 
   /*!
@@ -247,27 +244,21 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
     return foldl(RAJA::operators::multiplies<IdxLin>(), sizes[RangeInts]...);
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return strides[DIM];
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return sizes[DIM];
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return 0;
   }
 };
@@ -338,7 +329,9 @@ struct TypedLayout;
 
 template <typename IdxLin, typename... DimTypes, ptrdiff_t StrideOne>
 struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
-    : public Layout<sizeof...(DimTypes), strip_index_type_t<IdxLin>, StrideOne> {
+    : public Layout<sizeof...(DimTypes),
+                    strip_index_type_t<IdxLin>,
+                    StrideOne> {
 
   using StrippedIdxLin = strip_index_type_t<IdxLin>;
   using Self = TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>;
@@ -374,7 +367,7 @@ struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
    *                 dimensionality of this layout.
    */
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              DimTypes &... indices) const
+                                              DimTypes &...indices) const
   {
     toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)>{},
                     std::forward<IdxLin>(linear_index),
@@ -392,11 +385,11 @@ struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
   template <typename... Indices, camp::idx_t... RangeInts>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndicesHelper(camp::idx_seq<RangeInts...>,
                                                     IdxLin linear_index,
-                                                    Indices &... indices) const
+                                                    Indices &...indices) const
   {
     StrippedIdxLin locals[sizeof...(DimTypes)];
     Base::toIndices(stripIndexType(linear_index), locals[RangeInts]...);
-		camp::sink((indices = Indices{static_cast<Indices>(locals[RangeInts])})...);
+    camp::sink((indices = Indices{static_cast<Indices>(locals[RangeInts])})...);
   }
 };
 
diff --git a/include/RAJA/util/LocalArray.hpp b/include/RAJA/util/LocalArray.hpp
index 50680101d4..2f9986b68b 100644
--- a/include/RAJA/util/LocalArray.hpp
+++ b/include/RAJA/util/LocalArray.hpp
@@ -19,11 +19,10 @@
 #ifndef RAJA_util_LocalArray_HPP
 #define RAJA_util_LocalArray_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
 #include "RAJA/util/StaticLayout.hpp"
 #include "RAJA/util/TypedViewBase.hpp"
 
@@ -31,8 +30,7 @@ namespace RAJA
 {
 
 
-
-template<camp::idx_t ... Sizes>
+template <camp::idx_t... Sizes>
 using ParamList = camp::idx_seq<Sizes...>;
 
 /*!
@@ -51,79 +49,84 @@ using ParamList = camp::idx_seq<Sizes...>;
  */
 
 
-namespace internal {
-
-
+namespace internal
+{
 
-  template<typename Perm, typename Sizes>
-  struct StaticLayoutHelper;
 
-  template<camp::idx_t ... Perm, Index_type ...Sizes>
-  struct StaticLayoutHelper<camp::idx_seq<Perm...>, SizeList<Sizes...>>{
-      using type =  StaticLayout<camp::idx_seq<Perm...>, Sizes...>;
-  };
+template <typename Perm, typename Sizes>
+struct StaticLayoutHelper;
 
-  template<typename Perm, typename Sizes>
-  using getStaticLayoutType = typename StaticLayoutHelper<Perm, Sizes>::type;
+template <camp::idx_t... Perm, Index_type... Sizes>
+struct StaticLayoutHelper<camp::idx_seq<Perm...>, SizeList<Sizes...>> {
+  using type = StaticLayout<camp::idx_seq<Perm...>, Sizes...>;
+};
 
+template <typename Perm, typename Sizes>
+using getStaticLayoutType = typename StaticLayoutHelper<Perm, Sizes>::type;
 
 
-}
+}  // namespace internal
 
 
-template<typename ValueType, typename Perm, typename Sizes, typename... IndexTypes>
+template <typename ValueType,
+          typename Perm,
+          typename Sizes,
+          typename... IndexTypes>
 using TypedLocalArray =
-    internal::TypedViewBase<ValueType, ValueType *, internal::getStaticLayoutType<Perm, Sizes>, camp::list<IndexTypes...> >;
+    internal::TypedViewBase<ValueType,
+                            ValueType *,
+                            internal::getStaticLayoutType<Perm, Sizes>,
+                            camp::list<IndexTypes...>>;
 
 
-template<typename ValueType, typename Perm, typename Sizes>
+template <typename ValueType, typename Perm, typename Sizes>
 using LocalArray =
-    internal::TypedViewBase<ValueType, ValueType *, internal::getStaticLayoutType<Perm, Sizes>, internal::getDefaultIndexTypes<Perm> >;
-
+    internal::TypedViewBase<ValueType,
+                            ValueType *,
+                            internal::getStaticLayoutType<Perm, Sizes>,
+                            internal::getDefaultIndexTypes<Perm>>;
 
 
-
-
-template<typename AtomicPolicy, typename DataType, typename Perm,
-         typename Sizes, typename ... IndexTypes>
+template <typename AtomicPolicy,
+          typename DataType,
+          typename Perm,
+          typename Sizes,
+          typename... IndexTypes>
 struct AtomicTypedLocalArray {
 };
 
-template<typename AtomicPolicy, typename DataType, camp::idx_t ... Perm,
-          Index_type ... Sizes, typename ... IndexTypes>
-struct AtomicTypedLocalArray<AtomicPolicy, DataType, camp::idx_seq<Perm ...>,
-                             RAJA::SizeList<Sizes ...>, IndexTypes ...>{
+template <typename AtomicPolicy,
+          typename DataType,
+          camp::idx_t... Perm,
+          Index_type... Sizes,
+          typename... IndexTypes>
+struct AtomicTypedLocalArray<AtomicPolicy,
+                             DataType,
+                             camp::idx_seq<Perm...>,
+                             RAJA::SizeList<Sizes...>,
+                             IndexTypes...> {
   DataType *m_arrayPtr = nullptr;
   using value_type = DataType;
   using atomic_ref_t = RAJA::AtomicRef<value_type, AtomicPolicy>;
-  using layout_type = RAJA::StaticLayout<camp::idx_seq<Perm ...>, Sizes ...>;
+  using layout_type = RAJA::StaticLayout<camp::idx_seq<Perm...>, Sizes...>;
   static const camp::idx_t NumElem = layout_type::s_size;
 
   RAJA_HOST_DEVICE
-  atomic_ref_t operator()(IndexTypes ... indices) const
+  atomic_ref_t operator()(IndexTypes... indices) const
   {
-    return(atomic_ref_t(&m_arrayPtr[layout_type::s_oper(stripIndexType(indices)
-                                                     ...)]));
+    return (atomic_ref_t(
+        &m_arrayPtr[layout_type::s_oper(stripIndexType(indices)...)]));
   }
 
   RAJA_HOST_DEVICE
   RAJA_INLINE
-  constexpr
-  camp::idx_t size() const
-  {
-    return layout_type::s_size;
-  }
+  constexpr camp::idx_t size() const { return layout_type::s_size; }
 
   RAJA_HOST_DEVICE
-  RAJA_INLINE void set_data(DataType * data_ptr){
-    m_arrayPtr = data_ptr;
-  }
+  RAJA_INLINE void set_data(DataType *data_ptr) { m_arrayPtr = data_ptr; }
 };
 
 
-
-
-
 }  // end namespace RAJA
 
 
diff --git a/include/RAJA/util/OffsetLayout.hpp b/include/RAJA/util/OffsetLayout.hpp
index 827515062e..8b2731935c 100644
--- a/include/RAJA/util/OffsetLayout.hpp
+++ b/include/RAJA/util/OffsetLayout.hpp
@@ -19,17 +19,14 @@
 #ifndef RAJA_OFFSETLAYOUT_HPP
 #define RAJA_OFFSETLAYOUT_HPP
 
-#include "RAJA/config.hpp"
-
 #include <array>
 #include <limits>
 
-#include "camp/camp.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/index/IndexValue.hpp"
-
 #include "RAJA/util/Permutations.hpp"
 #include "RAJA/util/PermutedLayout.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -51,7 +48,7 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
   static constexpr camp::idx_t stride_one_dim = Base::stride_one_dim;
 
   static constexpr size_t n_dims = sizeof...(RangeInts);
-  IdxLin offsets[n_dims]={0}; //If not specified set to zero
+  IdxLin offsets[n_dims] = {0};  // If not specified set to zero
 
   constexpr RAJA_INLINE OffsetLayout_impl(
       std::array<IdxLin, sizeof...(RangeInts)> begin,
@@ -68,15 +65,18 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
 
   void shift(std::array<IdxLin, sizeof...(RangeInts)> shift)
   {
-    for(size_t i=0; i<n_dims; ++i) offsets[i] += shift[i];
+    for (size_t i = 0; i < n_dims; ++i)
+      offsets[i] += shift[i];
   }
 
-  template<camp::idx_t N, typename Idx>
+  template <camp::idx_t N, typename Idx>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheckError(Idx idx) const
   {
     printf("Error at index %d, value %ld is not within bounds [%ld, %ld] \n",
-           static_cast<int>(N), static_cast<long int>(idx),
-           static_cast<long int>(offsets[N]), static_cast<long int>(offsets[N] + base_.sizes[N] - 1));
+           static_cast<int>(N),
+           static_cast<long int>(idx),
+           static_cast<long int>(offsets[N]),
+           static_cast<long int>(offsets[N] + base_.sizes[N] - 1));
     RAJA_ABORT_OR_THROW("Out of bounds error \n");
   }
 
@@ -86,21 +86,21 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
   }
 
   template <camp::idx_t N, typename Idx, typename... Indices>
-  RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck(Idx idx, Indices... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck(Idx idx,
+                                                Indices... indices) const
   {
-    if(!(offsets[N] <=idx && idx < offsets[N] + base_.sizes[N]))
-    {
+    if (!(offsets[N] <= idx && idx < offsets[N] + base_.sizes[N])) {
       BoundsCheckError<N>(idx);
     }
     RAJA_UNUSED_VAR(idx);
-    BoundsCheck<N+1>(indices...);
+    BoundsCheck<N + 1>(indices...);
   }
 
   template <typename... Indices>
-  RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin operator()(
-      Indices... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin
+  operator()(Indices... indices) const
   {
-#if defined (RAJA_BOUNDS_CHECK_INTERNAL)
+#if defined(RAJA_BOUNDS_CHECK_INTERNAL)
     BoundsCheck<0>(indices...);
 #endif
     return base_((indices - offsets[RangeInts])...);
@@ -108,7 +108,7 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
 
   template <typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              Indices &&... indices) const
+                                              Indices&&... indices) const
   {
     base_.toIndices(linear_index, std::forward<Indices>(indices)...);
     camp::sink((indices = (offsets[RangeInts] + indices))...);
@@ -140,27 +140,21 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
     return base_.size_noproj();
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return base_.get_dim_stride();
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return base_.get_dim_size();
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return offsets[DIM];
   }
 };
@@ -184,47 +178,47 @@ struct OffsetLayout
   }
 };
 
-//TypedOffsetLayout
+// TypedOffsetLayout
 template <typename IdxLin, typename DimTuple>
 struct TypedOffsetLayout;
 
 template <typename IdxLin, typename... DimTypes>
 struct TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>
-: public OffsetLayout<sizeof...(DimTypes), strip_index_type_t<IdxLin>>
-{
-   using StrippedIdxLin = strip_index_type_t<IdxLin>;
-   using Self = TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
-   using Base = OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>;
-   using DimArr = std::array<StrippedIdxLin, sizeof...(DimTypes)>;
-   using DimTuple = camp::tuple<DimTypes...>;
-   using IndexLinear = IdxLin;
-
-   // Pull in base coonstructors
- #if 0
+    : public OffsetLayout<sizeof...(DimTypes), strip_index_type_t<IdxLin>> {
+  using StrippedIdxLin = strip_index_type_t<IdxLin>;
+  using Self = TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
+  using Base = OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>;
+  using DimArr = std::array<StrippedIdxLin, sizeof...(DimTypes)>;
+  using DimTuple = camp::tuple<DimTypes...>;
+  using IndexLinear = IdxLin;
+
+  // Pull in base constructors
+#if 0
    // This breaks with nvcc11
  using Base::Base;
- #else
-   using OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>::OffsetLayout;
- #endif
+#else
+  using OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>::OffsetLayout;
+#endif
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(DimTypes... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
+      DimTypes... indices) const
   {
     return IdxLin(Base::operator()(stripIndexType(indices)...));
   }
 
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              DimTypes &... indices) const
+                                              DimTypes&... indices) const
   {
     toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)>{},
                     std::forward<IdxLin>(linear_index),
-                    std::forward<DimTypes &>(indices)...);
+                    std::forward<DimTypes&>(indices)...);
   }
 
 private:
   template <typename... Indices, camp::idx_t... RangeInts>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndicesHelper(camp::idx_seq<RangeInts...>,
                                                     IdxLin linear_index,
-                                                    Indices &... indices) const
+                                                    Indices&... indices) const
   {
     StrippedIdxLin locals[sizeof...(DimTypes)];
     Base::toIndices(stripIndexType(linear_index), locals[RangeInts]...);
diff --git a/include/RAJA/util/OffsetOperators.hpp b/include/RAJA/util/OffsetOperators.hpp
index 150aaeee34..2d49dab447 100644
--- a/include/RAJA/util/OffsetOperators.hpp
+++ b/include/RAJA/util/OffsetOperators.hpp
@@ -19,7 +19,6 @@
 #define RAJA_OFFSETOPERATORS_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/util/concepts.hpp"
 #include "RAJA/util/macros.hpp"
 
@@ -27,54 +26,66 @@ namespace RAJA
 {
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct GetOffsetLeft
-{
-  template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret>
+struct GetOffsetLeft {
+  template <typename new_Ret,
+            typename new_Arg1 = new_Ret,
+            typename new_Arg2 = new_Ret>
   using rebind = GetOffsetLeft<new_Ret, new_Arg1, new_Arg2>;
 
-  template < size_t >
+  template <size_t>
   using rebunch = GetOffsetLeft<Ret, Arg1, Arg2>;
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  Ret operator()(Arg1 const& i, Arg1 const& num_i,
-                 Arg2 const& j, Arg2 const& RAJA_UNUSED_ARG(num_j)) const noexcept
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr Ret operator()(
+      Arg1 const& i,
+      Arg1 const& num_i,
+      Arg2 const& j,
+      Arg2 const& RAJA_UNUSED_ARG(num_j)) const noexcept
   {
     return i + j * num_i;
   }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct GetOffsetRight
-{
-  template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret>
+struct GetOffsetRight {
+  template <typename new_Ret,
+            typename new_Arg1 = new_Ret,
+            typename new_Arg2 = new_Ret>
   using rebind = GetOffsetRight<new_Ret, new_Arg1, new_Arg2>;
 
-  template < size_t >
+  template <size_t>
   using rebunch = GetOffsetRight<Ret, Arg1, Arg2>;
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i),
-                 Arg2 const& j, Arg2 const& num_j) const noexcept
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr Ret operator()(
+      Arg1 const& i,
+      Arg1 const& RAJA_UNUSED_ARG(num_i),
+      Arg2 const& j,
+      Arg2 const& num_j) const noexcept
   {
     return i * num_j + j;
   }
 };
 
 template <size_t t_bunch_num_i,
-          typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct GetOffsetLeftBunched
-{
-  template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret>
-  using rebind = GetOffsetLeftBunched<t_bunch_num_i, new_Ret, new_Arg1, new_Arg2>;
-
-  template < size_t new_bunch_num_i >
+          typename Ret,
+          typename Arg1 = Ret,
+          typename Arg2 = Arg1>
+struct GetOffsetLeftBunched {
+  template <typename new_Ret,
+            typename new_Arg1 = new_Ret,
+            typename new_Arg2 = new_Ret>
+  using rebind =
+      GetOffsetLeftBunched<t_bunch_num_i, new_Ret, new_Arg1, new_Arg2>;
+
+  template <size_t new_bunch_num_i>
   using rebunch = GetOffsetLeftBunched<new_bunch_num_i, Ret, Arg1, Arg2>;
 
   static constexpr Arg1 bunch_num_i{t_bunch_num_i};
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i),
-                 Arg2 const& j, Arg2 const& num_j) const noexcept
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr Ret operator()(
+      Arg1 const& i,
+      Arg1 const& RAJA_UNUSED_ARG(num_i),
+      Arg2 const& j,
+      Arg2 const& num_j) const noexcept
   {
     // assert(num_i >= bunch_num_i)
     Arg1 i_inner = i % bunch_num_i;
diff --git a/include/RAJA/util/Operators.hpp b/include/RAJA/util/Operators.hpp
index b4249e7182..1048c4bb1b 100644
--- a/include/RAJA/util/Operators.hpp
+++ b/include/RAJA/util/Operators.hpp
@@ -20,12 +20,13 @@
 #ifndef RAJA_operators_HPP
 #define RAJA_operators_HPP
 
-#include "RAJA/config.hpp"
-
 #include <stdint.h>
+
 #include <cfloat>
 #include <cstdint>
 #include <type_traits>
+
+#include "RAJA/config.hpp"
 #if defined(RAJA_CHECK_LIMITS)
 #include <limits>
 #endif
@@ -51,10 +52,11 @@ struct fp_associative_tag : associative_tag {
 };
 
 // get associativity tag appropriate for the type
-template < typename T >
+template <typename T>
 using associative_or_fp_associative_tag =
-  std::conditional_t<std::is_floating_point<std::decay_t<T>>::value,
-                     fp_associative_tag, associative_tag>;
+    std::conditional_t<std::is_floating_point<std::decay_t<T>>::value,
+                       fp_associative_tag,
+                       associative_tag>;
 
 template <typename Arg1, typename Arg2, typename Result>
 struct binary_function {
@@ -218,7 +220,6 @@ struct larger_of {
 }  // namespace types
 
 
-
 template <typename T, typename Enable = void>
 struct limits;
 
@@ -226,27 +227,26 @@ struct limits;
 // limits for signed integer types
 template <typename T>
 struct limits<T,
-  typename std::enable_if<std::is_integral<T>::value &&
-  !std::is_unsigned<T>::value>::type>
-{
+              typename std::enable_if<std::is_integral<T>::value &&
+                                      !std::is_unsigned<T>::value>::type> {
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T min()
   {
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4309 )
+#pragma warning(disable : 4309)
 #endif
-    return static_cast<T>(1llu << ((8llu * sizeof(T)) - 1llu) );
+    return static_cast<T>(1llu << ((8llu * sizeof(T)) - 1llu));
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4309 )
+#pragma warning(default : 4309)
 #endif
   }
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T max()
   {
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4309 )
+#pragma warning(disable : 4309)
 #endif
     return static_cast<T>(~(1llu << ((8llu * sizeof(T)) - 1llu)));
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4309 )
+#pragma warning(default : 4309)
 #endif
   }
 };
@@ -254,9 +254,8 @@ struct limits<T,
 // limits for signed integer types
 template <typename T>
 struct limits<T,
-  typename std::enable_if<std::is_integral<T>::value &&
-  std::is_unsigned<T>::value>::type>
-{
+              typename std::enable_if<std::is_integral<T>::value &&
+                                      std::is_unsigned<T>::value>::type> {
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T min()
   {
     return static_cast<T>(0);
@@ -264,11 +263,11 @@ struct limits<T,
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T max()
   {
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4309 )
+#pragma warning(disable : 4309)
 #endif
     return static_cast<T>(0xFFFFFFFFFFFFFFFF);
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4309 )
+#pragma warning(default : 4309)
 #endif
   }
 };
@@ -276,14 +275,8 @@ struct limits<T,
 
 template <>
 struct limits<float> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float min()
-  {
-    return -FLT_MAX;
-  }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float max()
-  {
-    return FLT_MAX;
-  }
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float min() { return -FLT_MAX; }
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float max() { return FLT_MAX; }
 };
 
 template <>
@@ -292,10 +285,7 @@ struct limits<double> {
   {
     return -DBL_MAX;
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr double max() 
-  { 
-     return DBL_MAX; 
-  }
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr double max() { return DBL_MAX; }
 };
 
 template <>
@@ -428,7 +418,7 @@ struct bit_or : public detail::binary_function<Arg1, Arg2, Ret> {
     return lhs | rhs;
   }
 
-RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{0}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{0}; }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
@@ -439,7 +429,7 @@ struct bit_and : public detail::binary_function<Arg1, Arg2, Ret> {
     return lhs & rhs;
   }
 
-RAJA_HOST_DEVICE static constexpr Ret identity() { return ~Ret{0}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return ~Ret{0}; }
 };
 
 
@@ -617,7 +607,8 @@ namespace detail
 {
 
 template <typename Fun, typename Ret, typename T, typename U>
-using is_binary_function = ::RAJA::concepts::requires_<BinaryFunction, Ret, T, U>;
+using is_binary_function =
+    ::RAJA::concepts::requires_<BinaryFunction, Ret, T, U>;
 
 template <typename Fun, typename Ret, typename T>
 using is_unary_function = ::RAJA::concepts::requires_<UnaryFunction, Ret, T>;
diff --git a/include/RAJA/util/Permutations.hpp b/include/RAJA/util/Permutations.hpp
index e79e9f2830..1b703b307d 100644
--- a/include/RAJA/util/Permutations.hpp
+++ b/include/RAJA/util/Permutations.hpp
@@ -18,10 +18,9 @@
 #ifndef RAJA_FORALLN_PERMUTATIONS_HPP
 #define RAJA_FORALLN_PERMUTATIONS_HPP
 
-#include "RAJA/config.hpp"
-
 #include <array>
 
+#include "RAJA/config.hpp"
 #include "camp/camp.hpp"
 
 namespace RAJA
@@ -193,51 +192,49 @@ using PERM_MLKIJ = camp::idx_seq<4, 3, 2, 0, 1>;
 using PERM_MLKJI = camp::idx_seq<4, 3, 2, 1, 0>;
 
 
-
-
-namespace internal 
+namespace internal
 {
 
 
-template<camp::idx_t I, camp::idx_t J, camp::idx_t N, typename Perm>
-struct CalcInversePermutationElem
-{
-  static constexpr camp::idx_t value = 
-    camp::seq_at<J, Perm>::value == I ? J : CalcInversePermutationElem<I, J+1, N, Perm>::value;
+template <camp::idx_t I, camp::idx_t J, camp::idx_t N, typename Perm>
+struct CalcInversePermutationElem {
+  static constexpr camp::idx_t value =
+      camp::seq_at<J, Perm>::value == I
+          ? J
+          : CalcInversePermutationElem<I, J + 1, N, Perm>::value;
 };
 
-template<camp::idx_t I, camp::idx_t N, typename Perm>
-struct CalcInversePermutationElem<I, N, N, Perm>
-{
+template <camp::idx_t I, camp::idx_t N, typename Perm>
+struct CalcInversePermutationElem<I, N, N, Perm> {
   static constexpr camp::idx_t value = I;
 };
 
 
-
-template<typename Range, typename Perm>
+template <typename Range, typename Perm>
 struct InversePermutationHelper;
 
-template<camp::idx_t ... Range, camp::idx_t ... Perm>
-struct InversePermutationHelper<camp::idx_seq<Range...>, 
-                                camp::idx_seq<Perm...>>
-{
+template <camp::idx_t... Range, camp::idx_t... Perm>
+struct InversePermutationHelper<camp::idx_seq<Range...>,
+                                camp::idx_seq<Perm...>> {
   static_assert(sizeof...(Range) == sizeof...(Perm), "Fatal Error");
-  using type = camp::idx_seq< 
-    CalcInversePermutationElem<Range, 0, sizeof...(Range), camp::idx_seq<Perm...>>::value ...  
-  >;  
+  using type = camp::idx_seq<
+      CalcInversePermutationElem<Range,
+                                 0,
+                                 sizeof...(Range),
+                                 camp::idx_seq<Perm...>>::value...>;
 };
 
 
-
-} // namespace internal
-
+}  // namespace internal
 
 
 /*!
   Inverts a permutation
 */
-template<typename Perm>
-using invert_permutation = typename internal::InversePermutationHelper<camp::make_idx_seq_t<camp::size<Perm>::value>, Perm>::type;
+template <typename Perm>
+using invert_permutation = typename internal::InversePermutationHelper<
+    camp::make_idx_seq_t<camp::size<Perm>::value>,
+    Perm>::type;
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/util/PermutedLayout.hpp b/include/RAJA/util/PermutedLayout.hpp
index 5bb176215b..06979094ac 100644
--- a/include/RAJA/util/PermutedLayout.hpp
+++ b/include/RAJA/util/PermutedLayout.hpp
@@ -19,12 +19,10 @@
 #ifndef RAJA_PERMUTEDLAYOUT_HPP
 #define RAJA_PERMUTEDLAYOUT_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 
+#include "RAJA/config.hpp"
 #include "RAJA/index/IndexValue.hpp"
-
 #include "RAJA/util/Layout.hpp"
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/Permutations.hpp"
@@ -81,7 +79,7 @@ auto make_permuted_layout(std::array<IdxLin, Rank> sizes,
 
 
   // return Layout<Rank, IdxLin>(sizes, strides);
-  auto ret  = Layout<Rank, IdxLin>();
+  auto ret = Layout<Rank, IdxLin>();
   for (size_t i = 0; i < Rank; ++i) {
     ret.sizes[i] = sizes[i];
     ret.strides[i] = strides[i];
diff --git a/include/RAJA/util/PluginContext.hpp b/include/RAJA/util/PluginContext.hpp
index 996836e397..e198a20983 100644
--- a/include/RAJA/util/PluginContext.hpp
+++ b/include/RAJA/util/PluginContext.hpp
@@ -8,34 +8,35 @@
 #ifndef RAJA_plugin_context_HPP
 #define RAJA_plugin_context_HPP
 
-#include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/internal/get_platform.hpp"
+#include "RAJA/policy/PolicyBase.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 class KokkosPluginLoader;
 
 struct PluginContext {
-  public:
-    PluginContext(const Platform p) :
-      platform(p) {}
+public:
+  PluginContext(const Platform p) : platform(p) {}
 
-    Platform platform;
+  Platform platform;
 
-  private:
-    mutable uint64_t kID;
+private:
+  mutable uint64_t kID;
 
-    friend class KokkosPluginLoader;
+  friend class KokkosPluginLoader;
 };
 
-template<typename Policy>
+template <typename Policy>
 PluginContext make_context()
 {
   return PluginContext{detail::get_platform<Policy>::value};
 }
 
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
+}  // namespace util
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/PluginLinker.hpp b/include/RAJA/util/PluginLinker.hpp
index e5b77bd027..b9b838baa8 100644
--- a/include/RAJA/util/PluginLinker.hpp
+++ b/include/RAJA/util/PluginLinker.hpp
@@ -8,17 +8,20 @@
 #ifndef RAJA_Plugin_Linker_HPP
 #define RAJA_Plugin_Linker_HPP
 
-#include "RAJA/util/RuntimePluginLoader.hpp"
 #include "RAJA/util/KokkosPluginLoader.hpp"
+#include "RAJA/util/RuntimePluginLoader.hpp"
 
-namespace {
-  namespace anonymous_RAJA {
-    struct pluginLinker {
-      inline pluginLinker() {
-        (void)RAJA::util::linkRuntimePluginLoader();
-        (void)RAJA::util::linkKokkosPluginLoader();
-      }
-    } pluginLinker;
+namespace
+{
+namespace anonymous_RAJA
+{
+struct pluginLinker {
+  inline pluginLinker()
+  {
+    (void)RAJA::util::linkRuntimePluginLoader();
+    (void)RAJA::util::linkKokkosPluginLoader();
   }
-}
+} pluginLinker;
+}  // namespace anonymous_RAJA
+}  // namespace
 #endif
diff --git a/include/RAJA/util/PluginOptions.hpp b/include/RAJA/util/PluginOptions.hpp
index f0b6a35507..0b1891fd2b 100644
--- a/include/RAJA/util/PluginOptions.hpp
+++ b/include/RAJA/util/PluginOptions.hpp
@@ -10,22 +10,23 @@
 
 #include <string>
 
-namespace RAJA {
-namespace util {
-
-struct PluginOptions
+namespace RAJA
+{
+namespace util
 {
-    PluginOptions(const std::string& newstr) : str(newstr) {};
-    
-    std::string str;
+
+struct PluginOptions {
+  PluginOptions(const std::string& newstr) : str(newstr){};
+
+  std::string str;
 };
 
 inline PluginOptions make_options(const std::string& newstr)
 {
-    return PluginOptions{newstr};
+  return PluginOptions{newstr};
 }
 
-} // namespace util
-} // namespace RAJA
+}  // namespace util
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/PluginStrategy.hpp b/include/RAJA/util/PluginStrategy.hpp
index 3935559bba..86f8fd7f6b 100644
--- a/include/RAJA/util/PluginStrategy.hpp
+++ b/include/RAJA/util/PluginStrategy.hpp
@@ -12,33 +12,35 @@
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/Registry.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 class PluginStrategy
 {
-  public:
-    RAJASHAREDDLL_API PluginStrategy();
+public:
+  RAJASHAREDDLL_API PluginStrategy();
 
-    virtual ~PluginStrategy() = default;
+  virtual ~PluginStrategy() = default;
 
-    virtual RAJASHAREDDLL_API void init(const PluginOptions& p);
+  virtual RAJASHAREDDLL_API void init(const PluginOptions& p);
 
-    virtual RAJASHAREDDLL_API void preCapture(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void preCapture(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void postCapture(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void postCapture(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void preLaunch(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void preLaunch(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void postLaunch(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void postLaunch(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void finalize();
+  virtual RAJASHAREDDLL_API void finalize();
 };
 
 using PluginRegistry = Registry<PluginStrategy>;
 
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
+}  // namespace util
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/util/Registry.hpp b/include/RAJA/util/Registry.hpp
index 579481a6ed..9a82cda8c8 100644
--- a/include/RAJA/util/Registry.hpp
+++ b/include/RAJA/util/Registry.hpp
@@ -10,126 +10,148 @@
 
 #include <memory>
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
+
+template <typename T>
+class RegistryEntry
+{
+  std::string Name, Desc;
+  std::shared_ptr<T> object;
+
+public:
+  RegistryEntry(const std::string &N,
+                const std::string &D,
+                std::shared_ptr<T> (*C)())
+      : Name(N), Desc(D), object(C())
+  {
+  }
 
-  template <typename T>
-  class RegistryEntry {
-    std::string Name, Desc;
-    std::shared_ptr<T> object;
+  const std::string &getName() const { return Name; }
+  const std::string &getDesc() const { return Desc; }
+  T *get() const { return object.get(); }
+};
+
+/// A global registry used in conjunction with static constructors to make
+/// pluggable components (like targets or garbage collectors) "just work" when
+/// linked with an executable.
+template <typename T>
+class Registry
+{
+public:
+  using type = T;
+  using entry = RegistryEntry<T>;
+
+  class node;
+  class iterator;
+
+private:
+  Registry() = delete;
+
+  friend class node;
+  static node *Head, *Tail;
+
+public:
+  /// Node in linked list of entries.
+  ///
+  class node
+  {
+    friend class iterator;
+    friend Registry<T>;
+
+    node *Next;
+    const entry &Val;
 
   public:
-    RegistryEntry(const std::string& N, const std::string& D,
-        std::shared_ptr<T> (*C)())
-        : Name(N), Desc(D), object(C()) {}
-
-    const std::string& getName() const { return Name; }
-    const std::string& getDesc() const { return Desc; }
-    T* get() const { return object.get(); }
+    node(const entry &V) : Next(nullptr), Val(V) {}
   };
 
-  /// A global registry used in conjunction with static constructors to make
-  /// pluggable components (like targets or garbage collectors) "just work" when
-  /// linked with an executable.
-  template <typename T>
-  class Registry {
+  /// Add a node to the Registry: this is the interface between the plugin and
+  /// the executable.
+  ///
+  /// This function is exported by the executable and called by the plugin to
+  /// add a node to the executable's registry. Therefore it's not defined here
+  /// to avoid it being instantiated in the plugin and is instead defined in
+  /// the executable (see RAJA_INSTANTIATE_REGISTRY below).
+  static RAJASHAREDDLL_API void add_node(node *N);
+
+  /// Iterators for registry entries.
+  ///
+  class iterator
+  {
+    const node *Cur;
+
   public:
-    using type = T;
-    using entry = RegistryEntry<T>;
+    explicit iterator(const node *N) : Cur(N) {}
+
+    bool operator==(const iterator &That) const { return Cur == That.Cur; }
+    bool operator!=(const iterator &That) const { return Cur != That.Cur; }
+    iterator &operator++()
+    {
+      Cur = Cur->Next;
+      return *this;
+    }
+    const entry &operator*() const { return Cur->Val; }
+    const entry *operator->() const { return &Cur->Val; }
+  };
 
-    class node;
-    class iterator;
+  // begin is not defined here in order to avoid usage of an undefined static
+  // data member, instead it's instantiated by RAJA_INSTANTIATE_REGISTRY.
+  static RAJASHAREDDLL_API iterator begin();
+  static iterator end() { return iterator(nullptr); }
 
-  private:
-    Registry() = delete;
+  /// A static registration template.
+  template <typename V>
+  class add
+  {
+    entry Entry;
+    node Node;
 
-    friend class node;
-    static node *Head, *Tail;
+    static std::shared_ptr<T> CtorFn() { return std::make_shared<V>(); }
 
   public:
-    /// Node in linked list of entries.
-    ///
-    class node {
-      friend class iterator;
-      friend Registry<T>;
-
-      node *Next;
-      const entry& Val;
-
-    public:
-      node(const entry &V) : Next(nullptr), Val(V) {}
-    };
-
-    /// Add a node to the Registry: this is the interface between the plugin and
-    /// the executable.
-    ///
-    /// This function is exported by the executable and called by the plugin to
-    /// add a node to the executable's registry. Therefore it's not defined here
-    /// to avoid it being instantiated in the plugin and is instead defined in
-    /// the executable (see RAJA_INSTANTIATE_REGISTRY below).
-    static RAJASHAREDDLL_API void add_node(node *N);
-
-    /// Iterators for registry entries.
-    ///
-    class iterator {
-      const node *Cur;
-
-    public:
-      explicit iterator(const node *N) : Cur(N) {}
-
-      bool operator==(const iterator &That) const { return Cur == That.Cur; }
-      bool operator!=(const iterator &That) const { return Cur != That.Cur; }
-      iterator &operator++() { Cur = Cur->Next; return *this; }
-      const entry &operator*() const { return Cur->Val; }
-      const entry *operator->() const { return &Cur->Val; }
-    };
-
-    // begin is not defined here in order to avoid usage of an undefined static
-    // data member, instead it's instantiated by RAJA_INSTANTIATE_REGISTRY.
-    static RAJASHAREDDLL_API iterator begin();
-    static iterator end()   { return iterator(nullptr); }
-
-    /// A static registration template.
-    template <typename V>
-    class add {
-      entry Entry;
-      node Node;
-
-      static std::shared_ptr<T> CtorFn() { return std::make_shared<V>(); }
-
-    public:
-      add(const std::string& Name, const std::string& Desc)
-          : Entry(Name, Desc, CtorFn), Node(Entry) {
-        add_node(&Node);
-      }
-    };
+    add(const std::string &Name, const std::string &Desc)
+        : Entry(Name, Desc, CtorFn), Node(Entry)
+    {
+      add_node(&Node);
+    }
   };
-
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
-
-#define RAJA_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \
-  namespace RAJA { \
-  namespace util { \
-  template<typename T> typename Registry<T>::node *Registry<T>::Head = nullptr;\
-  template<typename T> typename Registry<T>::node *Registry<T>::Tail = nullptr;\
-  template<typename T> \
-  void Registry<T>::add_node(typename Registry<T>::node *N) { \
-    if (Tail) \
-      Tail->Next = N; \
-    else \
-      Head = N; \
-    Tail = N; \
-  } \
-  template<typename T> typename Registry<T>::iterator Registry<T>::begin() { \
-    return iterator(Head); \
-  } \
-  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Head; \
-  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Tail; \
-  template \
-  void Registry<REGISTRY_CLASS::type>::add_node(REGISTRY_CLASS::node*); \
+};
+
+}  // namespace util
+}  // namespace RAJA
+
+#define RAJA_INSTANTIATE_REGISTRY(REGISTRY_CLASS)                            \
+  namespace RAJA                                                             \
+  {                                                                          \
+  namespace util                                                             \
+  {                                                                          \
+  template <typename T>                                                      \
+  typename Registry<T>::node *Registry<T>::Head = nullptr;                   \
+  template <typename T>                                                      \
+  typename Registry<T>::node *Registry<T>::Tail = nullptr;                   \
+  template <typename T>                                                      \
+  void Registry<T>::add_node(typename Registry<T>::node *N)                  \
+  {                                                                          \
+    if (Tail)                                                                \
+      Tail->Next = N;                                                        \
+    else                                                                     \
+      Head = N;                                                              \
+    Tail = N;                                                                \
+  }                                                                          \
+  template <typename T>                                                      \
+  typename Registry<T>::iterator Registry<T>::begin()                        \
+  {                                                                          \
+    return iterator(Head);                                                   \
+  }                                                                          \
+  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Head;       \
+  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Tail;       \
+  template void Registry<REGISTRY_CLASS::type>::add_node(                    \
+      REGISTRY_CLASS::node *);                                               \
   template REGISTRY_CLASS::iterator Registry<REGISTRY_CLASS::type>::begin(); \
-  } \
+  }                                                                          \
   }
 
 #endif
diff --git a/include/RAJA/util/RepeatView.hpp b/include/RAJA/util/RepeatView.hpp
index 618913f794..b2b78c7589 100644
--- a/include/RAJA/util/RepeatView.hpp
+++ b/include/RAJA/util/RepeatView.hpp
@@ -19,8 +19,8 @@
 #define RAJA_REPEATVIEW_HPP
 
 #include <cstddef>
-#include <utility>
 #include <type_traits>
+#include <utility>
 
 #include "RAJA/util/macros.hpp"
 
@@ -50,11 +50,9 @@ namespace RAJA
  *   unbounded extents
  *
  */
-template < typename T >
-struct RepeatView
-{
-  struct iterator
-  {
+template <typename T>
+struct RepeatView {
+  struct iterator {
     using difference_type = std::ptrdiff_t;
     using value_type = T;
     using reference = value_type const&;
@@ -62,44 +60,99 @@ struct RepeatView
     iterator() = default;
 
     constexpr iterator(const T* base, size_t index)
-      : m_value(base), m_index(index)
-    { }
+        : m_value(base), m_index(index)
+    {
+    }
 
     constexpr reference operator*() const noexcept { return *m_value; }
-    constexpr reference operator[](difference_type index) const noexcept { return *(*this + index); }
-
-    constexpr iterator& operator++() { ++m_index; return *this; }
-    constexpr iterator operator++(int) { auto tmp = *this; ++(*this); return tmp; }
-
-    constexpr iterator& operator--() { --m_index; return *this; }
-    constexpr iterator operator--(int) { auto tmp = *this; --(*this); return tmp; }
-
-    constexpr iterator& operator+=(difference_type rhs) { m_index += rhs; return *this; }
-    constexpr iterator& operator-=(difference_type rhs) { m_index -= rhs; return *this; }
+    constexpr reference operator[](difference_type index) const noexcept
+    {
+      return *(*this + index);
+    }
+
+    constexpr iterator& operator++()
+    {
+      ++m_index;
+      return *this;
+    }
+    constexpr iterator operator++(int)
+    {
+      auto tmp = *this;
+      ++(*this);
+      return tmp;
+    }
+
+    constexpr iterator& operator--()
+    {
+      --m_index;
+      return *this;
+    }
+    constexpr iterator operator--(int)
+    {
+      auto tmp = *this;
+      --(*this);
+      return tmp;
+    }
+
+    constexpr iterator& operator+=(difference_type rhs)
+    {
+      m_index += rhs;
+      return *this;
+    }
+    constexpr iterator& operator-=(difference_type rhs)
+    {
+      m_index -= rhs;
+      return *this;
+    }
 
     friend constexpr iterator operator+(iterator lhs, difference_type rhs)
-    { lhs += rhs; return lhs; }
+    {
+      lhs += rhs;
+      return lhs;
+    }
     friend constexpr iterator operator+(difference_type lhs, iterator rhs)
-    { rhs += lhs; return rhs; }
+    {
+      rhs += lhs;
+      return rhs;
+    }
 
     friend constexpr iterator operator-(iterator lhs, difference_type rhs)
-    { lhs -= rhs; return lhs; }
-    friend constexpr difference_type operator-(iterator const& lhs, iterator const& rhs)
-    { return static_cast<difference_type>(lhs.m_index) - static_cast<difference_type>(rhs.m_index); }
+    {
+      lhs -= rhs;
+      return lhs;
+    }
+    friend constexpr difference_type operator-(iterator const& lhs,
+                                               iterator const& rhs)
+    {
+      return static_cast<difference_type>(lhs.m_index) -
+             static_cast<difference_type>(rhs.m_index);
+    }
 
     friend constexpr bool operator==(iterator const& lhs, iterator const& rhs)
-    { return lhs.m_index == rhs.m_index; }
+    {
+      return lhs.m_index == rhs.m_index;
+    }
     friend constexpr bool operator!=(iterator const& lhs, iterator const& rhs)
-    { return !(lhs == rhs); }
+    {
+      return !(lhs == rhs);
+    }
 
     friend constexpr bool operator<(iterator const& lhs, iterator const& rhs)
-    { return lhs.m_index < rhs.m_index; }
+    {
+      return lhs.m_index < rhs.m_index;
+    }
     friend constexpr bool operator<=(iterator const& lhs, iterator const& rhs)
-    { return !(rhs < lhs); }
+    {
+      return !(rhs < lhs);
+    }
     friend constexpr bool operator>(iterator const& lhs, iterator const& rhs)
-    { return rhs < lhs; }
+    {
+      return rhs < lhs;
+    }
     friend constexpr bool operator>=(iterator const& lhs, iterator const& rhs)
-    { return !(lhs < rhs); }
+    {
+      return !(lhs < rhs);
+    }
 
   private:
     const T* m_value = nullptr;
@@ -109,16 +162,21 @@ struct RepeatView
   RepeatView() = delete;
 
   constexpr RepeatView(T const& value, size_t bound)
-    : m_bound(bound), m_value(value)
-  { }
+      : m_bound(bound), m_value(value)
+  {
+  }
 
   constexpr RepeatView(T&& value, size_t bound)
-    : m_bound(bound), m_value(std::move(value))
-  { }
+      : m_bound(bound), m_value(std::move(value))
+  {
+  }
 
   constexpr T const& front() const { return m_value; }
   constexpr T const& back() const { return m_value; }
-  constexpr T const& operator[](size_t RAJA_UNUSED_ARG(index)) const { return m_value; }
+  constexpr T const& operator[](size_t RAJA_UNUSED_ARG(index)) const
+  {
+    return m_value;
+  }
 
   constexpr iterator begin() const { return iterator(&m_value, 0); }
   constexpr iterator cbegin() const { return iterator(&m_value, 0); }
diff --git a/include/RAJA/util/RuntimePluginLoader.hpp b/include/RAJA/util/RuntimePluginLoader.hpp
index 3e7fbb165f..289e067b0a 100644
--- a/include/RAJA/util/RuntimePluginLoader.hpp
+++ b/include/RAJA/util/RuntimePluginLoader.hpp
@@ -14,39 +14,40 @@
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/PluginStrategy.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
-  class RuntimePluginLoader : public RAJA::util::PluginStrategy
-  {
-    using Parent = RAJA::util::PluginStrategy;
+class RuntimePluginLoader : public RAJA::util::PluginStrategy
+{
+  using Parent = RAJA::util::PluginStrategy;
 
-  public:
-    RuntimePluginLoader();
+public:
+  RuntimePluginLoader();
 
-    void init(const RAJA::util::PluginOptions& p) override;
+  void init(const RAJA::util::PluginOptions& p) override;
 
-    void preCapture(const RAJA::util::PluginContext& p) override;
+  void preCapture(const RAJA::util::PluginContext& p) override;
 
-    void postCapture(const RAJA::util::PluginContext& p) override;
+  void postCapture(const RAJA::util::PluginContext& p) override;
 
-    void preLaunch(const RAJA::util::PluginContext& p) override;
+  void preLaunch(const RAJA::util::PluginContext& p) override;
 
-    void postLaunch(const RAJA::util::PluginContext& p) override;
+  void postLaunch(const RAJA::util::PluginContext& p) override;
 
-    void finalize() override;
+  void finalize() override;
 
-  private:
+private:
+  void initPlugin(const std::string& path);
 
-    void initPlugin(const std::string &path);
-    
-    void initDirectory(const std::string &path);
+  void initDirectory(const std::string& path);
 
-    std::vector<std::unique_ptr<Parent>> plugins;
+  std::vector<std::unique_ptr<Parent>> plugins;
 
-  };  // end RuntimePluginLoader class
+};  // end RuntimePluginLoader class
 
-  void linkRuntimePluginLoader();
+void linkRuntimePluginLoader();
 
 }  // end namespace util
 }  // end namespace RAJA
diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp
index 6adea65b80..87f46c7bca 100644
--- a/include/RAJA/util/SoAPtr.hpp
+++ b/include/RAJA/util/SoAPtr.hpp
@@ -18,10 +18,10 @@
 #ifndef RAJA_SOA_PTR_HPP
 #define RAJA_SOA_PTR_HPP
 
-#include "RAJA/config.hpp"
-
 #include <type_traits>
 
+#include "RAJA/config.hpp"
+
 // for RAJA::reduce::detail::ValueLoc
 #include "RAJA/pattern/detail/reduce.hpp"
 #include "RAJA/util/types.hpp"
@@ -45,35 +45,37 @@ namespace detail
 template <typename T,
           typename mempool = RAJA::basic_mempool::MemPool<
               RAJA::basic_mempool::generic_allocator>,
-          typename accessor = DefaultAccessor >
+          typename accessor = DefaultAccessor>
 class SoAPtr
 {
-  template < typename, typename, typename >
-  friend class SoAPtr; // friend other instantiations of this class
+  template <typename, typename, typename>
+  friend class SoAPtr;  // friend other instantiations of this class
 
 public:
   using value_type = T;
 
-  template < typename rhs_accessor >
+  template <typename rhs_accessor>
   using rebind_accessor = SoAPtr<T, mempool, rhs_accessor>;
 
   SoAPtr() = default;
   SoAPtr(SoAPtr const&) = default;
-  SoAPtr(SoAPtr &&) = default;
+  SoAPtr(SoAPtr&&) = default;
   SoAPtr& operator=(SoAPtr const&) = default;
-  SoAPtr& operator=(SoAPtr &&) = default;
+  SoAPtr& operator=(SoAPtr&&) = default;
 
   explicit SoAPtr(size_t size)
       : mem(mempool::getInstance().template malloc<value_type>(size))
   {
   }
 
-  template < typename rhs_accessor,
-             std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr >
-  RAJA_HOST_DEVICE
-  explicit SoAPtr(SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
-    : mem(rhs.mem)
-  { }
+  template <
+      typename rhs_accessor,
+      std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr>
+  RAJA_HOST_DEVICE explicit SoAPtr(
+      SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
+      : mem(rhs.mem)
+  {
+  }
 
   SoAPtr& allocate(size_t size)
   {
@@ -90,8 +92,14 @@ class SoAPtr
 
   RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; }
 
-  RAJA_HOST_DEVICE value_type get(size_t i) const { return accessor::get(mem, i); }
-  RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, val); }
+  RAJA_HOST_DEVICE value_type get(size_t i) const
+  {
+    return accessor::get(mem, i);
+  }
+  RAJA_HOST_DEVICE void set(size_t i, value_type val)
+  {
+    accessor::set(mem, i, val);
+  }
 
 private:
   value_type* mem = nullptr;
@@ -100,26 +108,32 @@ class SoAPtr
 /*!
  * @brief Specialization for RAJA::reduce::detail::ValueLoc.
  */
-template <typename T, typename IndexType, bool doing_min, typename mempool, typename accessor>
-class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool, accessor>
+template <typename T,
+          typename IndexType,
+          bool doing_min,
+          typename mempool,
+          typename accessor>
+class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>,
+             mempool,
+             accessor>
 {
   using first_type = T;
   using second_type = IndexType;
 
-  template < typename, typename, typename >
-  friend class SoAPtr; // fiend other instantiations of this class
+  template <typename, typename, typename>
+  friend class SoAPtr;  // friend other instantiations of this class
 
 public:
   using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>;
 
-  template < typename rhs_accessor >
+  template <typename rhs_accessor>
   using rebind_accessor = SoAPtr<value_type, mempool, rhs_accessor>;
 
   SoAPtr() = default;
   SoAPtr(SoAPtr const&) = default;
-  SoAPtr(SoAPtr &&) = default;
+  SoAPtr(SoAPtr&&) = default;
   SoAPtr& operator=(SoAPtr const&) = default;
-  SoAPtr& operator=(SoAPtr &&) = default;
+  SoAPtr& operator=(SoAPtr&&) = default;
 
   explicit SoAPtr(size_t size)
       : mem(mempool::getInstance().template malloc<first_type>(size)),
@@ -127,13 +141,14 @@ class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool, a
   {
   }
 
-  template < typename rhs_accessor,
-             std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr >
-  RAJA_HOST_DEVICE
-  explicit SoAPtr(SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
-    : mem(rhs.mem)
-    , mem_idx(rhs.mem_idx)
-  { }
+  template <
+      typename rhs_accessor,
+      std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr>
+  RAJA_HOST_DEVICE explicit SoAPtr(
+      SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
+      : mem(rhs.mem), mem_idx(rhs.mem_idx)
+  {
+  }
 
   SoAPtr& allocate(size_t size)
   {
@@ -177,20 +192,20 @@ class SoAPtr<RAJA::expt::ValLoc<T, IndexType>, mempool, accessor>
   using first_type = T;
   using second_type = IndexType;
 
-  template < typename, typename, typename >
-  friend class SoAPtr; // friend other instantiations of this class
+  template <typename, typename, typename>
+  friend class SoAPtr;  // friend other instantiations of this class
 
 public:
   using value_type = RAJA::expt::ValLoc<T, IndexType>;
 
-  template < typename rhs_accessor >
+  template <typename rhs_accessor>
   using rebind_accessor = SoAPtr<value_type, mempool, rhs_accessor>;
 
   SoAPtr() = default;
   SoAPtr(SoAPtr const&) = default;
-  SoAPtr(SoAPtr &&) = default;
+  SoAPtr(SoAPtr&&) = default;
   SoAPtr& operator=(SoAPtr const&) = default;
-  SoAPtr& operator=(SoAPtr &&) = default;
+  SoAPtr& operator=(SoAPtr&&) = default;
 
   explicit SoAPtr(size_t size)
       : mem(mempool::getInstance().template malloc<first_type>(size)),
@@ -198,13 +213,14 @@ class SoAPtr<RAJA::expt::ValLoc<T, IndexType>, mempool, accessor>
   {
   }
 
-  template < typename rhs_accessor,
-             std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr >
-  RAJA_HOST_DEVICE
-  explicit SoAPtr(SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
-    : mem(rhs.mem)
-    , mem_idx(rhs.mem_idx)
-  { }
+  template <
+      typename rhs_accessor,
+      std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr>
+  RAJA_HOST_DEVICE explicit SoAPtr(
+      SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
+      : mem(rhs.mem), mem_idx(rhs.mem_idx)
+  {
+  }
 
   SoAPtr& allocate(size_t size)
   {
diff --git a/include/RAJA/util/Span.hpp b/include/RAJA/util/Span.hpp
index 2da2e0164c..9524abb13c 100644
--- a/include/RAJA/util/Span.hpp
+++ b/include/RAJA/util/Span.hpp
@@ -88,16 +88,34 @@ struct Span {
   RAJA_HOST_DEVICE RAJA_INLINE const_iterator cbegin() const { return m_begin; }
   RAJA_HOST_DEVICE RAJA_INLINE const_iterator cend() const { return m_end; }
 
-  RAJA_HOST_DEVICE RAJA_INLINE friend iterator begin(Span& s) { return s.begin(); }
+  RAJA_HOST_DEVICE RAJA_INLINE friend iterator begin(Span& s)
+  {
+    return s.begin();
+  }
   RAJA_HOST_DEVICE RAJA_INLINE friend iterator end(Span& s) { return s.end(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator begin(const Span& s) { return s.begin(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator end(const Span& s) { return s.end(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cbegin(const Span& s) { return s.cbegin(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cend(const Span& s) { return s.cend(); }
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator begin(const Span& s)
+  {
+    return s.begin();
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator end(const Span& s)
+  {
+    return s.end();
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cbegin(const Span& s)
+  {
+    return s.cbegin();
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cend(const Span& s)
+  {
+    return s.cend();
+  }
 
   RAJA_HOST_DEVICE RAJA_INLINE reference front() const { return *begin(); }
-  RAJA_HOST_DEVICE RAJA_INLINE reference back() const { return *(end()-1); }
-  RAJA_HOST_DEVICE RAJA_INLINE reference operator[](size_type i) const { return data()[i]; }
+  RAJA_HOST_DEVICE RAJA_INLINE reference back() const { return *(end() - 1); }
+  RAJA_HOST_DEVICE RAJA_INLINE reference operator[](size_type i) const
+  {
+    return data()[i];
+  }
   RAJA_HOST_DEVICE RAJA_INLINE iterator data() const { return m_begin; }
 
   RAJA_HOST_DEVICE RAJA_INLINE size_type size() const
@@ -157,21 +175,21 @@ struct Span {
  *
  */
 template <typename IterType, typename IndexType>
-RAJA_HOST_DEVICE RAJA_INLINE Span<IterType, IndexType> make_span(
-    IterType begin,
-    IndexType size)
+RAJA_HOST_DEVICE RAJA_INLINE Span<IterType, IndexType> make_span(IterType begin,
+                                                                 IndexType size)
 {
   return Span<IterType, IndexType>(begin, size);
 }
 
 template <typename Iter>
-RAJA_INLINE auto make_span(Iter &iterable)
+RAJA_INLINE auto make_span(Iter& iterable)
 {
   using std::begin;
-  using std::end;
   using std::distance;
-  return Span<typename Iter::iterator, decltype(distance(begin(iterable), end(iterable)))>
-    (begin(iterable), end(iterable));
+  using std::end;
+  return Span<typename Iter::iterator,
+              decltype(distance(begin(iterable), end(iterable)))>(
+      begin(iterable), end(iterable));
 }
 
 }  // end namespace RAJA
diff --git a/include/RAJA/util/StaticLayout.hpp b/include/RAJA/util/StaticLayout.hpp
index 8d27980f83..2cad0faf3b 100644
--- a/include/RAJA/util/StaticLayout.hpp
+++ b/include/RAJA/util/StaticLayout.hpp
@@ -19,20 +19,16 @@
 #ifndef RAJA_util_static_layout_HPP
 #define RAJA_util_static_layout_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <limits>
 
+#include "RAJA/config.hpp"
 #include "RAJA/index/IndexValue.hpp"
-
 #include "RAJA/internal/foldl.hpp"
-
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/Permutations.hpp"
 
 
-
 namespace RAJA
 {
 
@@ -40,7 +36,11 @@ namespace detail
 {
 
 
-template <typename IdxLin, typename Range, typename Sizes, typename Strides, typename DimTypeList=void>
+template <typename IdxLin,
+          typename Range,
+          typename Sizes,
+          typename Strides,
+          typename DimTypeList = void>
 struct StaticLayoutBase_impl;
 
 
@@ -58,9 +58,9 @@ struct StaticLayoutBase_impl<IdxLin,
   using sizes = camp::int_seq<IdxLin, Sizes...>;
   using strides = camp::int_seq<IdxLin, Strides...>;
 
-  static constexpr camp::idx_t stride_one_dim =
-      RAJA::max<camp::idx_t>(
-          (camp::seq_at<RangeInts, strides>::value == 1 ? camp::idx_t(RangeInts) : -1)...);
+  static constexpr camp::idx_t stride_one_dim = RAJA::max<camp::idx_t>(
+      (camp::seq_at<RangeInts, strides>::value == 1 ? camp::idx_t(RangeInts)
+                                                    : -1)...);
 
   static constexpr size_t n_dims = sizeof...(Sizes);
 
@@ -72,9 +72,9 @@ struct StaticLayoutBase_impl<IdxLin,
   RAJA_INLINE static void print()
   {
     camp::sink(printf("StaticLayout: arg%d: size=%d, stride=%d\n",
-                               (int)RangeInts,
-                               (int)Sizes,
-                               (int)Strides)...);
+                      (int)RangeInts,
+                      (int)Sizes,
+                      (int)Strides)...);
   }
 
 
@@ -95,7 +95,8 @@ struct StaticLayoutBase_impl<IdxLin,
 
 
   template <typename... Indices>
-  static RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin s_oper(Indices... indices)
+  static RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin s_oper(
+      Indices... indices)
   {
     // dot product of strides and indices
     return RAJA::sum<IdxLin>((IdxLin(indices * Strides))...);
@@ -108,8 +109,7 @@ struct StaticLayoutBase_impl<IdxLin,
       RAJA::product<IdxLin>((Sizes == IdxLin(0) ? IdxLin(1) : Sizes)...);
 
   // Multiply together all of the sizes
-  static constexpr IdxLin s_size_noproj =
-      RAJA::product<IdxLin>(Sizes...);
+  static constexpr IdxLin s_size_noproj = RAJA::product<IdxLin>(Sizes...);
 
   /*!
    * Computes a size of the layout's space with projections as size 1.
@@ -137,30 +137,23 @@ struct StaticLayoutBase_impl<IdxLin,
   }
 
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return camp::seq_at<DIM, strides>::value;
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return camp::seq_at<DIM, sizes>::value;
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return 0;
   }
-
 };
 
 template <typename IdxLin, IdxLin N, IdxLin Idx, IdxLin... Sizes>
@@ -189,7 +182,10 @@ struct StrideCalculatorIdx<IdxLin, N, N, Sizes...> {
 template <typename IdxLin, typename Range, typename Perm, typename Sizes>
 struct StrideCalculator;
 
-template <typename IdxLin, IdxLin ... Range, camp::idx_t... Perm, IdxLin... Sizes>
+template <typename IdxLin,
+          IdxLin... Range,
+          camp::idx_t... Perm,
+          IdxLin... Sizes>
 struct StrideCalculator<IdxLin,
                         camp::int_seq<IdxLin, Range...>,
                         camp::idx_seq<Perm...>,
@@ -202,14 +198,20 @@ struct StrideCalculator<IdxLin,
   using perm = camp::idx_seq<Perm...>;
   using inv_perm = invert_permutation<perm>;
 
-  using strides_unperm =
-      camp::int_seq<IdxLin, StrideCalculatorIdx<IdxLin, N, Range, camp::seq_at<Perm, sizes>::value...>::stride...>;
-
-  using strides = camp::int_seq<IdxLin, camp::seq_at<camp::seq_at<Range, inv_perm>::value, strides_unperm>::value...>;
+  using strides_unperm = camp::int_seq<
+      IdxLin,
+      StrideCalculatorIdx<IdxLin,
+                          N,
+                          Range,
+                          camp::seq_at<Perm, sizes>::value...>::stride...>;
+
+  using strides =
+      camp::int_seq<IdxLin,
+                    camp::seq_at<camp::seq_at<Range, inv_perm>::value,
+                                 strides_unperm>::value...>;
 };
 
 
-
 template <typename IdxLin,
           IdxLin... RangeInts,
           IdxLin... Sizes,
@@ -223,15 +225,14 @@ struct StaticLayoutBase_impl<IdxLin,
 
 
   using IndexLinear = IdxLin;
-  using ranges      = camp::int_seq<IdxLin, RangeInts...>;
-  using sizes       = camp::int_seq<IdxLin, Sizes...>;
-  using strides     = camp::int_seq<IdxLin, Strides...>;  
+  using ranges = camp::int_seq<IdxLin, RangeInts...>;
+  using sizes = camp::int_seq<IdxLin, Sizes...>;
+  using strides = camp::int_seq<IdxLin, Strides...>;
 
-  using InnerLayout = StaticLayoutBase_impl<IdxLin,ranges,sizes,strides,void>;
+  using InnerLayout =
+      StaticLayoutBase_impl<IdxLin, ranges, sizes, strides, void>;
 
-  static
-  constexpr
-  camp::idx_t stride_one_dim = InnerLayout::stride_one_dim;
+  static constexpr camp::idx_t stride_one_dim = InnerLayout::stride_one_dim;
 
   static constexpr IndexLinear n_dims = sizeof...(DimTypes);
   /*!
@@ -261,49 +262,42 @@ struct StaticLayoutBase_impl<IdxLin,
     return s_size_noproj;
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return InnerLayout{}.get_dim_stride();
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return camp::seq_at<DIM, sizes>::value;
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return 0;
   }
 
 
   RAJA_INLINE
   static void print() { InnerLayout::print(); }
-
 };
 
 
-
-
-
-template <typename Perm, typename IdxLin, typename Sizes, typename Indexes, typename TypeList>
-struct StaticLayoutMaker
-{
-  using strides = typename detail::StrideCalculator<IdxLin, Indexes, Perm, Sizes>::strides;
-  using type = StaticLayoutBase_impl<IdxLin, Indexes, Sizes, strides,TypeList>;
+template <typename Perm,
+          typename IdxLin,
+          typename Sizes,
+          typename Indexes,
+          typename TypeList>
+struct StaticLayoutMaker {
+  using strides =
+      typename detail::StrideCalculator<IdxLin, Indexes, Perm, Sizes>::strides;
+  using type = StaticLayoutBase_impl<IdxLin, Indexes, Sizes, strides, TypeList>;
 };
 
 
-
 }  // namespace detail
 
 
@@ -313,20 +307,21 @@ using StaticLayoutT = typename detail::StaticLayoutMaker<
     IdxLin,
     camp::int_seq<IdxLin, Sizes...>,
     camp::make_int_seq_t<IdxLin, sizeof...(Sizes)>,
-    void
-    >::type;
+    void>::type;
 
 template <typename Perm, camp::idx_t... Sizes>
 using StaticLayout = StaticLayoutT<Perm, camp::idx_t, Sizes...>;
 
-template <typename Perm, typename IdxLin, typename TypeList, camp::idx_t... Sizes>
+template <typename Perm,
+          typename IdxLin,
+          typename TypeList,
+          camp::idx_t... Sizes>
 using TypedStaticLayout = typename detail::StaticLayoutMaker<
     Perm,
     IdxLin,
     camp::int_seq<IdxLin, Sizes...>,
     camp::make_int_seq_t<IdxLin, sizeof...(Sizes)>,
-    TypeList
-    >::type;
+    TypeList>::type;
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/util/Timer.hpp b/include/RAJA/util/Timer.hpp
index 8c23a2c74d..d084077d7b 100644
--- a/include/RAJA/util/Timer.hpp
+++ b/include/RAJA/util/Timer.hpp
@@ -31,6 +31,7 @@
 #if defined(__bgq__) && (!defined(_LIBCPP_VERSION))
 
 #include <sys/time.h>
+
 #include <chrono>
 
 namespace RAJA
diff --git a/include/RAJA/util/TypeConvert.hpp b/include/RAJA/util/TypeConvert.hpp
index 5cdc019259..6c82cb45f9 100644
--- a/include/RAJA/util/TypeConvert.hpp
+++ b/include/RAJA/util/TypeConvert.hpp
@@ -22,12 +22,11 @@
 #ifndef RAJA_util_TypeConvert_HPP
 #define RAJA_util_TypeConvert_HPP
 
-#include "RAJA/config.hpp"
+#include <string.h>
 
+#include "RAJA/config.hpp"
 #include "RAJA/util/macros.hpp"
 
-#include <string.h>
-
 
 namespace RAJA
 {
diff --git a/include/RAJA/util/TypedViewBase.hpp b/include/RAJA/util/TypedViewBase.hpp
index 0d5bed35d6..564dbf190c 100644
--- a/include/RAJA/util/TypedViewBase.hpp
+++ b/include/RAJA/util/TypedViewBase.hpp
@@ -21,7 +21,6 @@
 #include <type_traits>
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/pattern/atomic.hpp"
 
 #if defined(RAJA_ENABLE_VECTORIZATION)
@@ -29,8 +28,8 @@
 #endif
 
 #include "RAJA/util/Layout.hpp"
-#include "RAJA/util/StaticLayout.hpp"
 #include "RAJA/util/OffsetLayout.hpp"
+#include "RAJA/util/StaticLayout.hpp"
 
 namespace RAJA
 {
@@ -38,705 +37,789 @@ namespace RAJA
 namespace internal
 {
 
-  template<camp::idx_t, typename T>
-  struct IndexToType{
-      using type = T;
-  };
+template <camp::idx_t, typename T>
+struct IndexToType {
+  using type = T;
+};
 
-  template<typename IdxSeq, typename T>
-  struct SequenceToType;
+template <typename IdxSeq, typename T>
+struct SequenceToType;
 
-  template<camp::idx_t ... Perm, typename T>
-  struct SequenceToType<camp::idx_seq<Perm...>, T>{
-      using type =  camp::list<typename IndexToType<Perm, T>::type...>;
-  };
+template <camp::idx_t... Perm, typename T>
+struct SequenceToType<camp::idx_seq<Perm...>, T> {
+  using type = camp::list<typename IndexToType<Perm, T>::type...>;
+};
 
-  template<typename Perm>
-  using getDefaultIndexTypes = typename SequenceToType<Perm, RAJA::Index_type>::type;
+template <typename Perm>
+using getDefaultIndexTypes =
+    typename SequenceToType<Perm, RAJA::Index_type>::type;
 
 
+// Helpers to convert
+// layouts -> OffsetLayouts
+// TypedLayouts -> TypedOffsetLayouts
+template <typename layout>
+struct add_offset {
+  using type = RAJA::OffsetLayout<layout::n_dims>;
+};
 
+template <typename IdxLin, typename... DimTypes>
+struct add_offset<RAJA::TypedLayout<IdxLin, camp::tuple<DimTypes...>>> {
+  using type = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
+};
 
-  //Helpers to convert
-  //layouts -> OffsetLayouts
-  //Typedlayouts -> TypedOffsetLayouts
-  template<typename layout>
-  struct add_offset
-  {
-    using type = RAJA::OffsetLayout<layout::n_dims>;
-  };
 
-  template<typename IdxLin, typename...DimTypes>
-  struct add_offset<RAJA::TypedLayout<IdxLin,camp::tuple<DimTypes...>>>
-  {
-    using type = RAJA::TypedOffsetLayout<IdxLin,camp::tuple<DimTypes...>>;
-  };
+#if defined(RAJA_ENABLE_VECTORIZATION)
+namespace detail
+{
+/*
+ * Returns the argument number which contains a VectorIndex
+ *
+ * returns -1 if none of the arguments is a VectorIndex
+ */
 
+template <camp::idx_t DIM, typename ARGS, typename IDX_SEQ>
+struct GetTensorArgIdxExpanded;
 
+template <camp::idx_t DIM, typename... ARGS, camp::idx_t... IDX>
+struct GetTensorArgIdxExpanded<DIM,
+                               camp::list<ARGS...>,
+                               camp::idx_seq<IDX...>> {
 
+  static constexpr camp::idx_t value = RAJA::max<camp::idx_t>(
+      (internal::expt::isTensorIndex<ARGS>() &&
+               internal::expt::getTensorDim<ARGS>() == DIM
+           ? IDX
+           : -1)...);
+};
 
-#if defined(RAJA_ENABLE_VECTORIZATION)
-  namespace detail
-  {
-    /*
-     * Returns the argument number which contains a VectorIndex
-     *
-     * returns -1 if none of the arguments are VectorIndexs
-     */
 
-    template<camp::idx_t DIM, typename ARGS, typename IDX_SEQ>
-    struct GetTensorArgIdxExpanded;
+}  // namespace detail
+#endif
 
-    template<camp::idx_t DIM, typename ... ARGS, camp::idx_t ... IDX>
-    struct GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::idx_seq<IDX...>> {
 
-        static constexpr camp::idx_t value =
-            RAJA::max<camp::idx_t>(
-                (internal::expt::isTensorIndex<ARGS>()&&internal::expt::getTensorDim<ARGS>()==DIM ? IDX : -1) ...);
-    };
+/*
+ * Returns the number of arguments which are VectorIndex arguments
+ */
+template <typename... ARGS>
+struct count_num_tensor_args {
+  static constexpr camp::idx_t value =
+#if defined(RAJA_ENABLE_VECTORIZATION)
+      RAJA::sum<camp::idx_t>(
+          (internal::expt::isTensorIndex<ARGS>() ? 1 : 0)...);
+#else
+      0;  // There should be 0 Tensor indices if not vectorizing.
+#endif
+};
 
+#if defined(RAJA_ENABLE_VECTORIZATION)
+/*
+ * Returns which argument has a vector index
+ */
+template <camp::idx_t DIM, typename... ARGS>
+struct GetTensorArgIdx {
+  static constexpr camp::idx_t value = detail::GetTensorArgIdxExpanded<
+      DIM,
+      camp::list<ARGS...>,
+      camp::make_idx_seq_t<sizeof...(ARGS)>>::value;
+};
 
-  } // namespace detail
+template <camp::idx_t DIM, typename... ARGS>
+struct GetTensorArgIdx<DIM, camp::list<ARGS...>> {
+  static constexpr camp::idx_t value = detail::GetTensorArgIdxExpanded<
+      DIM,
+      camp::list<ARGS...>,
+      camp::make_idx_seq_t<sizeof...(ARGS)>>::value;
+};
+
+/*
+ * Returns the beginning index in a vector argument
+ */
+template <camp::idx_t DIM, typename LAYOUT, typename... ARGS>
+RAJA_INLINE RAJA_HOST_DEVICE static constexpr camp::idx_t get_tensor_args_begin(
+    LAYOUT const &layout,
+    ARGS... args)
+{
+  return RAJA::max<camp::idx_t>(
+      internal::expt::getTensorDim<ARGS>() == DIM
+          ? internal::expt::getTensorBegin<ARGS>(
+                args,
+                layout.template get_dim_begin<
+                    GetTensorArgIdx<DIM, ARGS...>::value>())
+          : 0 ...);
+}
+
+/*
+ * Returns the number of elements in the vector argument
+ */
+template <camp::idx_t DIM, typename LAYOUT, typename... ARGS>
+RAJA_INLINE RAJA_HOST_DEVICE static constexpr camp::idx_t get_tensor_args_size(
+    LAYOUT const &layout,
+    ARGS... args)
+{
+  return RAJA::max<camp::idx_t>(
+      internal::expt::getTensorDim<ARGS>() == DIM
+          ? internal::expt::getTensorSize<ARGS>(
+                args,
+                layout.template get_dim_size<
+                    GetTensorArgIdx<DIM, ARGS...>::value>())
+          : 0 ...);
+}
 #endif
 
 
+namespace detail
+{
+
+/*!
+ * Provides conversion of view data to a return type.
+ *
+ * For scalars, this just returns the scalar.
+ *
+ * In the future development, this may return SIMD vectors or matrices using
+ * class specializations.
+ */
+template <typename VecSeq,
+          typename Args,
+          typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          typename LayoutType>
+struct ViewReturnHelper;
+
+
+/*
+ * Specialization for Scalar return types
+ */
+template <typename... Args,
+          typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          typename LayoutType>
+struct ViewReturnHelper<camp::idx_seq<>,
+                        camp::list<Args...>,
+                        ElementType,
+                        PointerType,
+                        LinIdx,
+                        LayoutType> {
+  using return_type = ElementType &;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr return_type make_return(LayoutType const &layout,
+                                           PointerType const &data,
+                                           Args const &...args)
+  {
+    return data[stripIndexType(layout(args...))];
+  }
+};
+
 
-  /*
-   * Returns the number of arguments which are VectorIndexs
-   */
-  template<typename ... ARGS>
-  struct count_num_tensor_args{
-    static constexpr camp::idx_t value =
-#if defined(RAJA_ENABLE_VECTORIZATION)
-        RAJA::sum<camp::idx_t>(
-            (internal::expt::isTensorIndex<ARGS>() ? 1 : 0) ...);
-#else
-        0;  // There should be 0 Tensor indices if not vectorizing.
-#endif
-  };
-  
 #if defined(RAJA_ENABLE_VECTORIZATION)
-  /*
-   * Returns which argument has a vector index
-   */
-  template<camp::idx_t DIM, typename ... ARGS>
-  struct GetTensorArgIdx{
-      static constexpr camp::idx_t value =
-          detail::GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::make_idx_seq_t<sizeof...(ARGS)> >:: value;
-  };
-
-  template<camp::idx_t DIM, typename ... ARGS>
-  struct GetTensorArgIdx<DIM,camp::list<ARGS...>>{
-      static constexpr camp::idx_t value =
-          detail::GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::make_idx_seq_t<sizeof...(ARGS)> >:: value;
-  };
+/*
+ * Specialization for Tensor return types
+ */
+template <camp::idx_t VecHead,
+          camp::idx_t... VecSeq,
+          typename... Args,
+          typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          typename LayoutType>
+struct ViewReturnHelper<camp::idx_seq<VecHead, VecSeq...>,
+                        camp::list<Args...>,
+                        ElementType,
+                        PointerType,
+                        LinIdx,
+                        LayoutType> {
+
+  static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
+
+  // This is the stride-one dimensions w.r.t. the tensor not the View
+  // For example:
+  //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
+  //  For a matrix, s_stride_one_dim is either:
+  //                 -1 neither row nor column are packed
+  //                 0 rows are stride-one
+  //                 1 columns are stride-one
+  static constexpr camp::idx_t s_stride_one_dim = RAJA::max<camp::idx_t>(
+      (GetTensorArgIdx<VecHead, Args...>::value == LayoutType::stride_one_dim
+           ? VecHead
+           : -1),
+      (GetTensorArgIdx<VecSeq, Args...>::value == LayoutType::stride_one_dim
+           ? VecSeq
+           : -1)...);
+
+
+  using tensor_reg_type =
+      typename camp::at_v<camp::list<Args...>,
+                          GetTensorArgIdx<0, Args...>::value>::tensor_type;
+  using ref_type = internal::expt::TensorRef<ElementType *,
+                                             LinIdx,
+                                             internal::expt::TENSOR_MULTIPLE,
+                                             s_num_dims,
+                                             s_stride_one_dim>;
+  using return_type =
+      internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
 
-  /*
-   * Returns the beginning index in a vector argument
-   */
-  template<camp::idx_t DIM, typename LAYOUT, typename ... ARGS>
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  static constexpr camp::idx_t get_tensor_args_begin(LAYOUT const &layout, ARGS ... args){
-    return RAJA::max<camp::idx_t>(
-        internal::expt::getTensorDim<ARGS>()==DIM
-        ? internal::expt::getTensorBegin<ARGS>(args, layout.template get_dim_begin<GetTensorArgIdx<DIM, ARGS...>::value>())
-        : 0 ...);
+  static constexpr return_type make_return(LayoutType const &layout,
+                                           PointerType const &data,
+                                           Args const &...args)
+  {
+
+    return return_type(ref_type{
+        // data pointer
+        &data[0] +
+            layout(internal::expt::isTensorIndex<Args>()
+                       ? LinIdx{0}
+                       : (LinIdx)stripIndexType(
+                             internal::expt::stripTensorIndexByValue(args))...),
+        // strides
+        {(LinIdx)layout.template get_dim_stride<
+             GetTensorArgIdx<VecHead, Args...>::value>(),
+         (LinIdx)layout.template get_dim_stride<
+             GetTensorArgIdx<VecSeq, Args...>::value>()...},
+        // tile
+        {// begin
+         {(LinIdx)(get_tensor_args_begin<VecHead>(layout, args...)),
+          (LinIdx)(get_tensor_args_begin<VecSeq>(layout, args...))...},
+
+         // size
+         {(LinIdx)get_tensor_args_size<VecHead>(layout, args...),
+          (LinIdx)get_tensor_args_size<VecSeq>(layout, args...)...}}});
   }
+};
+
+
+/*
+ * Specialization for Tensor return types and static layout types
+ */
+template <camp::idx_t VecHead,
+          camp::idx_t... VecSeq,
+          typename... INDEX_TYPES,
+          typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          LinIdx... RangeInts,
+          LinIdx... SizeInts,
+          LinIdx... StrideInts,
+          typename DIM_LIST>
+struct ViewReturnHelper<
+    camp::idx_seq<VecHead, VecSeq...>,
+    camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>,
+    ElementType,
+    PointerType,
+    LinIdx,
+    RAJA::detail::StaticLayoutBase_impl<LinIdx,
+                                        camp::int_seq<LinIdx, RangeInts...>,
+                                        camp::int_seq<LinIdx, SizeInts...>,
+                                        camp::int_seq<LinIdx, StrideInts...>,
+                                        DIM_LIST>> {
+  static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
+
+  using index_list = camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>;
+
+  using range_seq = camp::int_seq<LinIdx, RangeInts...>;
+  using size_seq = camp::int_seq<LinIdx, SizeInts...>;
+  using stride_seq = camp::int_seq<LinIdx, StrideInts...>;
+  using LayoutType = RAJA::detail::
+      StaticLayoutBase_impl<LinIdx, range_seq, size_seq, stride_seq, DIM_LIST>;
+
+  // This is the stride-one dimensions w.r.t. the tensor not the View
+  // For example:
+  //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
+  //  For a matrix, s_stride_one_dim is either:
+  //                 -1 neither row nor column are packed
+  //                 0 rows are stride-one
+  //                 1 columns are stride-one
+  static constexpr camp::idx_t s_stride_one_dim = RAJA::max<camp::idx_t>(
+      (GetTensorArgIdx<VecHead, index_list>::value == LayoutType::stride_one_dim
+           ? VecHead
+           : -1),
+      (GetTensorArgIdx<VecSeq, index_list>::value == LayoutType::stride_one_dim
+           ? VecSeq
+           : -1)...);
+
+
+  using new_begin_seq =
+      camp::int_seq<LinIdx,
+                    (LinIdx)get_tensor_args_begin<VecHead>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
+                    (LinIdx)get_tensor_args_begin<VecSeq>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...>;
+  using new_size_seq =
+      camp::int_seq<LinIdx,
+                    (LinIdx)get_tensor_args_size<VecHead>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
+                    (LinIdx)get_tensor_args_size<VecSeq>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...>;
+
+  using new_begin_type = internal::expt::StaticIndexArray<new_begin_seq>;
+  using new_size_type = internal::expt::StaticIndexArray<new_size_seq>;
+
+
+  using tensor_reg_type =
+      typename camp::at_v<index_list,
+                          GetTensorArgIdx<0, index_list>::value>::tensor_type;
+  using ref_type =
+      internal::expt::StaticTensorRef<ElementType *,
+                                      LinIdx,
+                                      internal::expt::TENSOR_MULTIPLE,
+                                      stride_seq,
+                                      new_begin_seq,
+                                      new_size_seq,
+                                      s_stride_one_dim>;
+  using return_type =
+      internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
+
 
-  /*
-   * Returns the number of elements in the vector argument
-   */
-  template<camp::idx_t DIM, typename LAYOUT, typename ... ARGS>
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  static constexpr camp::idx_t get_tensor_args_size(LAYOUT const &layout, ARGS ... args){
-    return RAJA::max<camp::idx_t>(
-        internal::expt::getTensorDim<ARGS>()==DIM
-        ? internal::expt::getTensorSize<ARGS>(args, layout.template get_dim_size<GetTensorArgIdx<DIM, ARGS...>::value>())
-        : 0 ...);
+  static constexpr return_type make_return(
+      LayoutType const &layout,
+      PointerType const &data,
+      RAJA::expt::StaticTensorIndex<INDEX_TYPES> const &...args)
+  {
+
+    return return_type(ref_type{
+        // data pointer
+        &data[0] +
+            layout(internal::expt::isTensorIndex<
+                       typename RAJA::expt::StaticTensorIndex<
+                           INDEX_TYPES>::base_type>()
+                       ? LinIdx{0}
+                       : (LinIdx)stripIndexType(
+                             internal::expt::stripTensorIndexByValue(args))...),
+        // strides
+        typename ref_type::stride_type(),
+        // tile
+        {new_begin_type(), new_size_type()}});
   }
+};
 #endif
 
 
-  namespace detail {
+}  // namespace detail
 
-  /*!
-   * Provides conversion of view data to a return type.
-   *
-   * For scalars, this just returns the scalar.
-   *
-   * In the future development, this may return SIMD vectors or matrices using
-   * class specializations.
-   */
-  template<typename VecSeq, typename Args, typename ElementType, typename PointerType, typename LinIdx, typename LayoutType>
-  struct ViewReturnHelper;
 
+/*
+ * Computes the return type of a view.
+ *
+ * If any of the arguments are a VectorIndex, it creates a VectorRef
+ * return type.
+ *
+ * Otherwise it produces the usual scalar reference return type
+ */
+template <typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          typename LayoutType,
+          typename... Args>
+using view_return_type_t = typename detail::ViewReturnHelper<
+    camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
+    camp::list<Args...>,
+    ElementType,
+    PointerType,
+    LinIdx,
+    LayoutType>::return_type;
+
+/*
+ * Creates the return value for a View
+ *
+ * If any of the arguments are a VectorIndex, it creates a VectorRef
+ * return value.
+ *
+ * Otherwise it produces the usual scalar reference return value
+ */
+template <typename ElementType,
+          typename LinIdx,
+          typename LayoutType,
+          typename PointerType,
+          typename... Args>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr view_return_type_t<ElementType,
+                                                          PointerType,
+                                                          LinIdx,
+                                                          LayoutType,
+                                                          Args...>
+view_make_return_value(LayoutType const &layout,
+                       PointerType const &data,
+                       Args const &...args)
+{
+  return detail::ViewReturnHelper<
+      camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
+      camp::list<Args...>,
+      ElementType,
+      PointerType,
+      LinIdx,
+      LayoutType>::make_return(layout, data, args...);
+}
 
-  /*
-   * Specialization for Scalar return types
-   */
-  template<typename ... Args, typename ElementType, typename PointerType, typename LinIdx, typename LayoutType>
-  struct ViewReturnHelper<camp::idx_seq<>, camp::list<Args...>, ElementType, PointerType, LinIdx, LayoutType>
-  {
-      using return_type = ElementType &;
+namespace detail
+{
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      return_type make_return(LayoutType const &layout, PointerType const &data, Args const &... args){
-        return data[stripIndexType(layout(args...))];
-      }
-  };
+/**
+ * This class will help strip strongly typed indices
+ *
+ * This default implementation static_asserts that Arg is convertible to
+ * Expected (after stripping strongly typed indices), otherwise it's an
+ * error.  This enforces types for the TypedView.
+ *
+ * In this primary template there is no TensorIndex to unpack, so it just
+ * strips any strongly typed indices.
+ */
+template <typename Expected, typename Arg>
+struct MatchTypedViewArgHelper {
+  static_assert(std::is_convertible<strip_index_type_t<Arg>,
+                                    strip_index_type_t<Expected>>::value,
+                "Argument isn't compatible");
 
+  using type = strip_index_type_t<Arg>;
 
-#if defined(RAJA_ENABLE_VECTORIZATION)
-  /*
-   * Specialization for Tensor return types
-   */
-  template<camp::idx_t VecHead, camp::idx_t ... VecSeq, typename ... Args, typename ElementType, typename PointerType, typename LinIdx, typename LayoutType>
-  struct ViewReturnHelper<camp::idx_seq<VecHead,VecSeq...>, camp::list<Args...>, ElementType, PointerType, LinIdx, LayoutType>
+  static RAJA_HOST_DEVICE RAJA_INLINE constexpr type extract(Arg arg)
   {
+    return stripIndexType(arg);
+  }
+};
 
-      static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
-
-      // This is the stride-one dimensions w.r.t. the tensor not the View
-      // For example:
-      //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
-      //  For a matrix, s_stride_one_dim is either:
-      //                 -1 neither row nor column are packed
-      //                 0 rows are stride-one
-      //                 1 columns are stride-one
-      static constexpr camp::idx_t s_stride_one_dim =
-          RAJA::max<camp::idx_t>(
-                  (GetTensorArgIdx<VecHead,Args...>::value == LayoutType::stride_one_dim ? VecHead : -1 ),
-                  (GetTensorArgIdx<VecSeq, Args...>::value == LayoutType::stride_one_dim ? VecSeq  : -1 )...
-          );
-
-
-      using tensor_reg_type = typename camp::at_v<camp::list<Args...>, GetTensorArgIdx<0, Args...>::value>::tensor_type;
-      using ref_type = internal::expt::TensorRef<ElementType*, LinIdx, internal::expt::TENSOR_MULTIPLE, s_num_dims, s_stride_one_dim>;
-      using return_type = internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      return_type make_return(LayoutType const &layout, PointerType const &data, Args const &... args){
-
-        return return_type(ref_type{
-          // data pointer
-          &data[0] + layout(internal::expt::isTensorIndex<Args>() ? LinIdx{0} : (LinIdx)stripIndexType(internal::expt::stripTensorIndexByValue(args))...),
-          // strides
-          {
-              (LinIdx)layout.template get_dim_stride<GetTensorArgIdx<VecHead,Args...>::value>(),
-              (LinIdx)layout.template get_dim_stride<GetTensorArgIdx<VecSeq, Args...>::value>()...
-          },
-          // tile
-          {
-              // begin
-              {
-                  (LinIdx)(get_tensor_args_begin<VecHead>(layout, args...)),
-                  (LinIdx)(get_tensor_args_begin<VecSeq> (layout, args...))...
-              },
-
-              // size
-              {
-                  (LinIdx)get_tensor_args_size<VecHead>(layout, args...),
-                  (LinIdx)get_tensor_args_size<VecSeq> (layout, args...)...
-              }
-          }
-        });
-      }
-  };
 
+#if defined(RAJA_ENABLE_VECTORIZATION)
+/**
+ * Specialization where the argument is wrapped in a TensorIndex type
+ *
+ * In this case, the strongly typed index inside the TensorIndex is stripped
+ * and a TensorIndex over the stripped index type is returned.
+ */
+template <typename Expected, typename Arg, typename VectorType, camp::idx_t DIM>
+struct MatchTypedViewArgHelper<Expected,
+                               RAJA::expt::TensorIndex<Arg, VectorType, DIM>> {
 
+  static_assert(std::is_convertible<strip_index_type_t<Arg>,
+                                    strip_index_type_t<Expected>>::value,
+                "Argument isn't compatible");
 
+  using arg_type = strip_index_type_t<Arg>;
 
+  using type = RAJA::expt::TensorIndex<arg_type, VectorType, DIM>;
 
-  /*
-   * Specialization for Tensor return types and static layout types
-   */
-  template<
-      camp::idx_t VecHead, camp::idx_t ... VecSeq,
-      typename ... INDEX_TYPES,
-      typename ElementType, typename PointerType, typename LinIdx,
-      LinIdx... RangeInts, LinIdx... SizeInts, LinIdx... StrideInts,
-      typename DIM_LIST
-  >
-  struct ViewReturnHelper<
-      camp::idx_seq<VecHead,VecSeq...>,
-      camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>,
-      ElementType, PointerType,
-      LinIdx,
-      RAJA::detail::StaticLayoutBase_impl<
-          LinIdx,
-          camp::int_seq<LinIdx,RangeInts...>,
-          camp::int_seq<LinIdx,SizeInts...>,
-          camp::int_seq<LinIdx,StrideInts...>,
-          DIM_LIST
-      >
-  > {
-      static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
-
-      using index_list = camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>;
-
-      using range_seq  = camp::int_seq<LinIdx,RangeInts... >;
-      using size_seq   = camp::int_seq<LinIdx,SizeInts...  >;
-      using stride_seq = camp::int_seq<LinIdx,StrideInts...>;
-      using LayoutType = RAJA::detail::StaticLayoutBase_impl<LinIdx,range_seq,size_seq,stride_seq,DIM_LIST>;
-
-      // This is the stride-one dimensions w.r.t. the tensor not the View
-      // For example:
-      //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
-      //  For a matrix, s_stride_one_dim is either:
-      //                 -1 neither row nor column are packed
-      //                 0 rows are stride-one
-      //                 1 columns are stride-one
-      static constexpr camp::idx_t s_stride_one_dim =
-          RAJA::max<camp::idx_t>(
-                  (GetTensorArgIdx<VecHead,index_list>::value == LayoutType::stride_one_dim ? VecHead : -1 ),
-                  (GetTensorArgIdx<VecSeq, index_list>::value == LayoutType::stride_one_dim ? VecSeq  : -1 )...
-          );
-
-
-
-
-      using new_begin_seq = camp::int_seq<
-                LinIdx,
-                (LinIdx)get_tensor_args_begin<VecHead>(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
-                (LinIdx)get_tensor_args_begin<VecSeq >(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...
-            >;
-      using new_size_seq  = camp::int_seq<
-                LinIdx,
-                (LinIdx)get_tensor_args_size <VecHead>(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
-                (LinIdx)get_tensor_args_size <VecSeq >(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...
-            >;
-
-      using new_begin_type = internal::expt::StaticIndexArray<new_begin_seq>;
-      using new_size_type  = internal::expt::StaticIndexArray<new_size_seq >;
-
-
-      using tensor_reg_type = typename camp::at_v<index_list, GetTensorArgIdx<0, index_list>::value>::tensor_type;
-      using ref_type = internal::expt::StaticTensorRef<ElementType*, LinIdx, internal::expt::TENSOR_MULTIPLE,stride_seq,new_begin_seq,new_size_seq, s_stride_one_dim>;
-      using return_type = internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      return_type make_return(LayoutType const &layout, PointerType const &data, RAJA::expt::StaticTensorIndex<INDEX_TYPES> const &... args){
-
-        return return_type(ref_type{
-          // data pointer
-          &data[0] + layout(internal::expt::isTensorIndex<typename RAJA::expt::StaticTensorIndex<INDEX_TYPES>::base_type>() ? LinIdx{0} : (LinIdx)stripIndexType(internal::expt::stripTensorIndexByValue(args))...),
-          // strides
-          typename ref_type::stride_type(),
-          // tile
-          {
-              new_begin_type(),
-              new_size_type()
-          }
-        });
-      }
-  };
+  static constexpr RAJA_HOST_DEVICE RAJA_INLINE type
+  extract(RAJA::expt::TensorIndex<Arg, VectorType, DIM> vec_arg)
+  {
+    return type(stripIndexType(*vec_arg), vec_arg.size());
+  }
+};
+
+/**
+ * Specialization where the argument is wrapped in a StaticTensorIndex type
+ *
+ * In this case, the argument value is unused: a default-constructed
+ * StaticTensorIndex over the stripped index type is returned.
+ */
+template <typename Expected,
+          typename Arg,
+          typename VectorType,
+          camp::idx_t DIM,
+          Arg BEGIN,
+          strip_index_type_t<Arg> LENGTH>
+struct MatchTypedViewArgHelper<
+    Expected,
+    RAJA::expt::StaticTensorIndex<
+        RAJA::expt::
+            StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>>> {
+
+  static_assert(std::is_convertible<strip_index_type_t<Arg>,
+                                    strip_index_type_t<Expected>>::value,
+                "Argument isn't compatible");
+
+  using arg_type = strip_index_type_t<Arg>;
+
+  using type = RAJA::expt::StaticTensorIndex<
+      RAJA::expt::
+          StaticTensorIndexInner<arg_type, VectorType, DIM, BEGIN, LENGTH>>;
+
+  static constexpr RAJA_HOST_DEVICE RAJA_INLINE type extract(
+      RAJA::expt::StaticTensorIndex<
+          RAJA::expt::
+              StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>>
+          RAJA_UNUSED_ARG(vec_arg))
+  {
+    return type();
+  }
+};
 #endif
 
+}  // namespace detail
 
-  } // namespace detail
 
+template <typename Expected, typename Arg>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr
+    typename detail::MatchTypedViewArgHelper<Expected, Arg>::type
+    match_typed_view_arg(Arg const &arg)
+{
+  return detail::MatchTypedViewArgHelper<Expected, Arg>::extract(arg);
+}
 
-  /*
-   * Computes the return type of a view.
-   *
-   * If any of the arguments are a VectorIndex, it creates a VectorRef
-   * return type.
-   *
-   * Otherwise it produces the usual scalar reference return type
-   */
-  template<typename ElementType, typename PointerType, typename LinIdx, typename LayoutType, typename ... Args>
-  using view_return_type_t =
-      typename detail::ViewReturnHelper<
-        camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
-        camp::list<Args...>,
-        ElementType,
-        PointerType,
-        LinIdx,
-        LayoutType>::return_type;
 
+template <typename ValueType, typename PointerType, typename LayoutType>
+class ViewBase
+{
+
+public:
+  using value_type = ValueType;
+  using pointer_type = PointerType;
+  using layout_type = LayoutType;
+  using linear_index_type = typename layout_type::IndexLinear;
+  using nc_value_type = typename std::remove_const<value_type>::type;
+  using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
+      typename std::remove_pointer<pointer_type>::type>::type>::type;
+
+  using Self = ViewBase<value_type, pointer_type, layout_type>;
+  using NonConstView = ViewBase<nc_value_type, nc_pointer_type, layout_type>;
+
+  using shifted_layout_type = typename add_offset<layout_type>::type;
+  using ShiftedView = ViewBase<value_type, pointer_type, shifted_layout_type>;
+
+protected:
+  pointer_type m_data;
+  layout_type const m_layout;
+
+public:
   /*
-   * Creates the return value for a View
+   * Defaulted operators (AJK):
    *
-   * If any of the arguments are a VectorIndex, it creates a VectorRef
-   * return value.
+   * OpenMP Target currently needs the View classes to be trivially copyable,
+   * which means that we need to use the default ctor's and assignment
+   * operators.
    *
-   * Otherwise it produces the usual scalar reference return value
+   * These defaulted operators cause issues with some versions of CUDA, so
+   * in the case that CUDA is enabled, we switch to explicitly defined
+   * operators.
    */
-  template<typename ElementType, typename LinIdx, typename LayoutType, typename PointerType, typename ... Args>
+#if (defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_CLANG_CUDA))
+  RAJA_HOST_DEVICE
   RAJA_INLINE
+  constexpr ViewBase(){};
+
   RAJA_HOST_DEVICE
-  constexpr
-  view_return_type_t<ElementType, PointerType, LinIdx, LayoutType, Args...>
-  view_make_return_value(LayoutType const &layout, PointerType const &data, Args const &... args){
-    return detail::ViewReturnHelper<
-        camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
-        camp::list<Args...>,
-        ElementType,
-        PointerType,
-        LinIdx,
-        LayoutType>::make_return(layout, data, args...);
+  RAJA_INLINE ViewBase(ViewBase const &c)
+      : m_layout(c.m_layout), m_data(c.m_data)
+  {
   }
 
-  namespace detail
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  ViewBase &operator=(ViewBase const &c)
   {
+    m_layout = c.m_layout;
+    m_data = c.m_data;
+  }
+#else
+  constexpr ViewBase() = default;
+  RAJA_INLINE constexpr ViewBase(ViewBase const &) = default;
+  RAJA_INLINE constexpr ViewBase(ViewBase &&) = default;
+  RAJA_INLINE ViewBase &operator=(ViewBase const &) = default;
+  RAJA_INLINE ViewBase &operator=(ViewBase &&) = default;
 
-  /**
-   * This class will help strip strongly typed indices
-   *
-   * This default implementation static_asserts that Expected==Arg, otherwise
-   * it's an error.  This enforces types for the TypedView.
-   *
-   * Specialization where expected type is same as argument type.
-   * In this case, there is no VectorIndex to unpack, just strip any strongly
-   * typed indices.
-   */
-  template<typename Expected, typename Arg>
-  struct MatchTypedViewArgHelper{
-    static_assert(std::is_convertible<strip_index_type_t<Arg>, strip_index_type_t<Expected>>::value,
-        "Argument isn't compatible");
-
-    using type = strip_index_type_t<Arg>;
-
-    static RAJA_HOST_DEVICE RAJA_INLINE
-    constexpr
-    type extract(Arg arg){
-      return stripIndexType(arg);
-    }
-  };
-
+#endif
 
-#if defined(RAJA_ENABLE_VECTORIZATION)
-  /**
-   * Specialization where expected type is wrapped in a VectorIndex type
-   *
-   * In this case, there is no VectorIndex to unpack, just strip any strongly
-   * typed indices.
-   */
-  template<typename Expected, typename Arg, typename VectorType, camp::idx_t DIM>
-  struct MatchTypedViewArgHelper<Expected, RAJA::expt::TensorIndex<Arg, VectorType, DIM> >{
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr ViewBase(pointer_type data, layout_type &&layout)
+      : m_data(data), m_layout(layout)
+  {
+  }
 
-    static_assert(std::is_convertible<strip_index_type_t<Arg>, strip_index_type_t<Expected>>::value,
-        "Argument isn't compatible");
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr ViewBase(pointer_type data,
+                                                  Args... dim_sizes)
+      : m_data(data), m_layout(dim_sizes...)
+  {
+  }
 
-    using arg_type = strip_index_type_t<Arg>;
 
-    using type = RAJA::expt::TensorIndex<arg_type, VectorType, DIM>;
+  template <bool IsConstView = std::is_const<value_type>::value>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr ViewBase(
+      typename std::enable_if<IsConstView, NonConstView>::type const &rhs)
+      : m_data(rhs.get_data()), m_layout(rhs.get_layout())
+  {
+  }
 
-    static constexpr RAJA_HOST_DEVICE RAJA_INLINE
-    type extract(RAJA::expt::TensorIndex<Arg, VectorType, DIM> vec_arg){
-      return type(stripIndexType(*vec_arg), vec_arg.size());
-    }
-  };
 
-  /**
-   * Specialization where expected type is wrapped in a StaticTensorIndex type
-   *
-   * In this case, there is no StaticTensorIndex to unpack, just strip any strongly
-   * typed indices.
-   */
-  template<typename Expected, typename Arg, typename VectorType, camp::idx_t DIM, Arg BEGIN, strip_index_type_t<Arg> LENGTH>
-  struct MatchTypedViewArgHelper<Expected, RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>> >{
+  RAJA_HOST_DEVICE
+  RAJA_INLINE void set_data(PointerType data_ptr) { m_data = data_ptr; }
 
-    static_assert(std::is_convertible<strip_index_type_t<Arg>, strip_index_type_t<Expected>>::value,
-        "Argument isn't compatible");
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr pointer_type const &get_data() const { return m_data; }
 
-    using arg_type = strip_index_type_t<Arg>;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr layout_type const &get_layout() const { return m_layout; }
 
-    using type = RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<arg_type, VectorType, DIM, BEGIN, LENGTH>>;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr linear_index_type size() const { return m_layout.size(); }
 
-    static constexpr RAJA_HOST_DEVICE RAJA_INLINE
-    type extract(RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>> RAJA_UNUSED_ARG(vec_arg)){
-      return type();
-    }
-  };
-#endif
 
-  } //namespace detail
+  template <camp::idx_t DIM>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr linear_index_type get_dim_size() const
+  {
+    return m_layout.template get_dim_size<DIM>();
+  }
 
 
-  template<typename Expected, typename Arg>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  typename detail::MatchTypedViewArgHelper<Expected, Arg>::type
-  match_typed_view_arg(Arg const &arg)
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator()(Args... args) const
   {
-    return detail::MatchTypedViewArgHelper<Expected, Arg>::extract(arg);
+    return view_make_return_value<value_type, linear_index_type>(m_layout,
+                                                                 m_data,
+                                                                 args...);
   }
 
 
+  /*
+   * Compatibility note (AJK):
+   * We are using variadic arguments even though operator[] takes exactly 1
+   * argument. This gets around a template instantiation bug in CUDA/nvcc
+   * 9.1, which seems to have been fixed in CUDA 9.2+.
+   */
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator[](Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(m_layout,
+                                                                 m_data,
+                                                                 args...);
+  }
 
-template <typename ValueType,
-          typename PointerType,
-          typename LayoutType>
-class ViewBase {
-
-  public:
-    using value_type = ValueType;
-    using pointer_type = PointerType;
-    using layout_type = LayoutType;
-    using linear_index_type = typename layout_type::IndexLinear;
-    using nc_value_type = typename std::remove_const<value_type>::type;
-    using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
-        typename std::remove_pointer<pointer_type>::type>::type>::type;
-
-    using Self = ViewBase<value_type, pointer_type, layout_type>;
-    using NonConstView = ViewBase<nc_value_type, nc_pointer_type, layout_type>;
-
-    using shifted_layout_type = typename add_offset<layout_type>::type;
-    using ShiftedView = ViewBase<value_type, pointer_type, shifted_layout_type>;
-
-  protected:
-    pointer_type m_data;
-    layout_type const m_layout;
-
-  public:
-
-
-    /*
-     * Defaulted operators (AJK):
-     *
-     * OpenMP Target currently needs the View classes to be trivially copyable,
-     * which means that we need to use the default ctor's and assignment
-     * operators.
-     *
-     * These defaulted operators cause issues with some versions of CUDA, so
-     * in the case that CUDA is enabled, we switch to explicitly defined
-     * operators.
-     */
-#if (defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_CLANG_CUDA))
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr ViewBase(){};
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE ViewBase(ViewBase const &c)
-      : m_layout(c.m_layout), m_data(c.m_data)
-    {
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    ViewBase &operator=(ViewBase const &c)
-    {
-      m_layout = c.m_layout;
-      m_data = c.m_data;
-    }
-#else
-    constexpr ViewBase() = default;
-    RAJA_INLINE constexpr ViewBase(ViewBase const &) = default;
-    RAJA_INLINE constexpr ViewBase(ViewBase &&) = default;
-    RAJA_INLINE ViewBase& operator=(ViewBase const &) = default;
-    RAJA_INLINE ViewBase& operator=(ViewBase &&) = default;
 
-#endif
+  template <size_t n_dims = layout_type::n_dims,
+            typename IdxLin = linear_index_type>
+  RAJA_INLINE ShiftedView shift(const std::array<IdxLin, n_dims> &shift)
+  {
+    static_assert(n_dims == layout_type::n_dims,
+                  "Dimension mismatch in view shift");
 
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    ViewBase(pointer_type data, layout_type &&layout) :
-    m_data(data), m_layout(layout)
-    {
-    }
-
-    template <typename... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    ViewBase(pointer_type data, Args... dim_sizes) :
-    m_data(data), m_layout(dim_sizes...)
-    {
-    }
-
-
-    template <bool IsConstView = std::is_const<value_type>::value>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    ViewBase(typename std::enable_if<IsConstView, NonConstView>::type const &rhs) :
-    m_data(rhs.get_data()), m_layout(rhs.get_layout())
-    {
-    }
-
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE void set_data(PointerType data_ptr){
-      m_data = data_ptr;
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    pointer_type const &get_data() const
-    {
-      return m_data;
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    layout_type const &get_layout() const
-    {
-      return m_layout;
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    linear_index_type size() const
-    {
-      return m_layout.size();
-    }
-
-
-    template<camp::idx_t DIM>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    linear_index_type get_dim_size() const
-    {
-      return m_layout.template get_dim_size<DIM>();
-    }
-
-
-    template <typename... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator()(Args... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(m_layout, m_data, args...);
-    }
-
-
-
-    /*
-     * Compatibility note (AJK):
-     * We are using variadic arguments even though operator[] takes exactly 1 argument
-     * This gets around a template instantiation bug in CUDA/nvcc 9.1, which seems to have
-     * been fixed in CUDA 9.2+
-     */
-    template <typename ... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator[](Args ... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(m_layout, m_data, args...);
-    }
-
-
-
-    template <size_t n_dims = layout_type::n_dims, typename IdxLin = linear_index_type>
-    RAJA_INLINE
-    ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
-    {
-      static_assert(n_dims==layout_type::n_dims, "Dimension mismatch in view shift");
-
-      shifted_layout_type shift_layout(m_layout);
-      shift_layout.shift(shift);
-
-      return ShiftedView(m_data, shift_layout);
-    }
+    shifted_layout_type shift_layout(m_layout);
+    shift_layout.shift(shift);
 
+    return ShiftedView(m_data, shift_layout);
+  }
 };
 
 
 template <typename ValueType,
-        typename PointerType,
-        typename LayoutType,
-        typename IndexTypes>
+          typename PointerType,
+          typename LayoutType,
+          typename IndexTypes>
 class TypedViewBase;
 
 template <typename ValueType,
           typename PointerType,
           typename LayoutType,
           typename... IndexTypes>
-class TypedViewBase<ValueType, PointerType, LayoutType, camp::list<IndexTypes...>> :
-  public ViewBase<ValueType, PointerType, LayoutType>
+class TypedViewBase<ValueType,
+                    PointerType,
+                    LayoutType,
+                    camp::list<IndexTypes...>>
+    : public ViewBase<ValueType, PointerType, LayoutType>
 {
 
-  public:
-    using value_type = ValueType;
-    using pointer_type = PointerType;
-    using layout_type = LayoutType;
-    using linear_index_type = typename layout_type::IndexLinear;
-    using nc_value_type = typename std::remove_const<value_type>::type;
-    using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
-        typename std::remove_pointer<pointer_type>::type>::type>::type;
-
-    using Base = ViewBase<ValueType, PointerType, LayoutType>;
-    using Self = TypedViewBase<value_type, pointer_type, layout_type, camp::list<IndexTypes...> >;
-    using NonConstView = TypedViewBase<nc_value_type, nc_pointer_type, layout_type, camp::list<IndexTypes...> >;
-
-    using shifted_layout_type = typename add_offset<layout_type>::type;
-    using ShiftedView = TypedViewBase<value_type, pointer_type, shifted_layout_type, camp::list<IndexTypes...> >;
-
-    static constexpr size_t n_dims = sizeof...(IndexTypes);
-
-    using Base::Base;
-
-    template <typename... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator()(Args... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(Base::m_layout, Base::m_data, match_typed_view_arg<IndexTypes>(args)...);
-    }
-
-
-
-    /*
-     * Compatibility note (AJK):
-     * We are using variadic arguments even though operator[] takes exactly 1 argument
-     * This gets around a template instantiation bug in CUDA/nvcc 9.1, which seems to have
-     * been fixed in CUDA 9.2+
-     */
-    template <typename ... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator[](Args ... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(Base::m_layout, Base::m_data, match_typed_view_arg<IndexTypes>(args)...);
-    }
+public:
+  using value_type = ValueType;
+  using pointer_type = PointerType;
+  using layout_type = LayoutType;
+  using linear_index_type = typename layout_type::IndexLinear;
+  using nc_value_type = typename std::remove_const<value_type>::type;
+  using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
+      typename std::remove_pointer<pointer_type>::type>::type>::type;
+
+  using Base = ViewBase<ValueType, PointerType, LayoutType>;
+  using Self = TypedViewBase<value_type,
+                             pointer_type,
+                             layout_type,
+                             camp::list<IndexTypes...>>;
+  using NonConstView = TypedViewBase<nc_value_type,
+                                     nc_pointer_type,
+                                     layout_type,
+                                     camp::list<IndexTypes...>>;
+
+  using shifted_layout_type = typename add_offset<layout_type>::type;
+  using ShiftedView = TypedViewBase<value_type,
+                                    pointer_type,
+                                    shifted_layout_type,
+                                    camp::list<IndexTypes...>>;
+
+  static constexpr size_t n_dims = sizeof...(IndexTypes);
+
+  using Base::Base;
+
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator()(Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        Base::m_layout,
+        Base::m_data,
+        match_typed_view_arg<IndexTypes>(args)...);
+  }
 
 
+  /*
+   * Compatibility note (AJK):
+   * We are using variadic arguments even though operator[] takes exactly 1
+   * argument. This gets around a template instantiation bug in CUDA/nvcc 9.1,
+   * which seems to have been fixed in CUDA 9.2+
+   */
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator[](Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        Base::m_layout,
+        Base::m_data,
+        match_typed_view_arg<IndexTypes>(args)...);
+  }
 
-    template <size_t n_dims = sizeof...(IndexTypes), typename IdxLin = linear_index_type>
-    RAJA_INLINE
-    ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
-    {
-      static_assert(n_dims==layout_type::n_dims, "Dimension mismatch in view shift");
 
-      shifted_layout_type shift_layout(Base::get_layout());
-      shift_layout.shift(shift);
+  template <size_t n_dims = sizeof...(IndexTypes),
+            typename IdxLin = linear_index_type>
+  RAJA_INLINE ShiftedView shift(const std::array<IdxLin, n_dims> &shift)
+  {
+    static_assert(n_dims == layout_type::n_dims,
+                  "Dimension mismatch in view shift");
 
-      return ShiftedView(Base::get_data(), shift_layout);
-    }
+    shifted_layout_type shift_layout(Base::get_layout());
+    shift_layout.shift(shift);
 
+    return ShiftedView(Base::get_data(), shift_layout);
+  }
 };
 
 
-
-} // namespace internal
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp
index fcaee67f98..e7c928d2f6 100644
--- a/include/RAJA/util/View.hpp
+++ b/include/RAJA/util/View.hpp
@@ -21,9 +21,7 @@
 #include <type_traits>
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/pattern/atomic.hpp"
-
 #include "RAJA/util/IndexLayout.hpp"
 #include "RAJA/util/Layout.hpp"
 #include "RAJA/util/OffsetLayout.hpp"
@@ -32,66 +30,60 @@
 namespace RAJA
 {
 
-//Helpers to convert
-//layouts -> OffsetLayouts
-//Typedlayouts -> TypedOffsetLayouts
-template<typename layout>
-struct add_offset
-{
+// Helpers to convert
+// layouts -> OffsetLayouts
+// Typedlayouts -> TypedOffsetLayouts
+template <typename layout>
+struct add_offset {
   using type = RAJA::OffsetLayout<layout::n_dims>;
 };
 
-template<typename IdxLin, typename...DimTypes>
-struct add_offset<RAJA::TypedLayout<IdxLin,camp::tuple<DimTypes...>>>
-{
-  using type = RAJA::TypedOffsetLayout<IdxLin,camp::tuple<DimTypes...>>;
+template <typename IdxLin, typename... DimTypes>
+struct add_offset<RAJA::TypedLayout<IdxLin, camp::tuple<DimTypes...>>> {
+  using type = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
 };
 
 template <typename ValueType,
           typename LayoutType,
           typename PointerType = ValueType *>
-using View =
-    internal::ViewBase<ValueType, PointerType, LayoutType>;
-
+using View = internal::ViewBase<ValueType, PointerType, LayoutType>;
 
 
 template <typename ValueType, typename LayoutType, typename... IndexTypes>
-using TypedView =
-    internal::TypedViewBase<ValueType, ValueType *, LayoutType, camp::list<IndexTypes...> >;
-
-
-
+using TypedView = internal::TypedViewBase<ValueType,
+                                          ValueType *,
+                                          LayoutType,
+                                          camp::list<IndexTypes...>>;
 
 
 template <typename IndexType, typename ValueType>
-RAJA_INLINE View<ValueType, Layout<1, IndexType, 0> > make_view(
-    ValueType *ptr)
+RAJA_INLINE View<ValueType, Layout<1, IndexType, 0>> make_view(ValueType *ptr)
 {
-  return View<ValueType, Layout<1, IndexType, 0> >(ptr, 1);
+  return View<ValueType, Layout<1, IndexType, 0>>(ptr, 1);
 }
 
-template <size_t n_dims, typename IndexType, typename ValueType, typename... IndexTypes>
-RAJA_INLINE View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...> > make_index_view(
-    ValueType *ptr, IndexLayout<n_dims, IndexType, IndexTypes...> index_layout)
+template <size_t n_dims,
+          typename IndexType,
+          typename ValueType,
+          typename... IndexTypes>
+RAJA_INLINE View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...>>
+make_index_view(ValueType *ptr,
+                IndexLayout<n_dims, IndexType, IndexTypes...> index_layout)
 {
-  return View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...> >(ptr, index_layout);
+  return View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...>>(
+      ptr, index_layout);
 }
 
 
 // select certain indices from a tuple, given a curated index sequence
 // returns linear index of layout(ar...)
 template <typename Lay, typename Tup, camp::idx_t... Idxs>
-RAJA_HOST_DEVICE RAJA_INLINE 
-auto selecttuple( Lay lyout, Tup&& tup, camp::idx_seq<Idxs...> ) ->
-  decltype(
-            lyout(
-              camp::get<Idxs>(std::forward<Tup>(tup))...
-            )
-          )
-{ 
-  return lyout(
-                camp::get<Idxs>(std::forward<Tup>(tup))...
-              );
+RAJA_HOST_DEVICE RAJA_INLINE auto selecttuple(Lay lyout,
+                                              Tup &&tup,
+                                              camp::idx_seq<Idxs...>)
+    -> decltype(lyout(camp::get<Idxs>(std::forward<Tup>(tup))...))
+{
+  return lyout(camp::get<Idxs>(std::forward<Tup>(tup))...);
 }
 
 // sequence combiner
@@ -99,10 +91,7 @@ template <typename Seq1, typename Seq2>
 struct cat_seq;
 
 template <camp::idx_t... Idxs1, camp::idx_t... Idxs2>
-struct cat_seq  < camp::idx_seq<Idxs1...>,
-                  camp::idx_seq<Idxs2...>
-                >
-{
+struct cat_seq<camp::idx_seq<Idxs1...>, camp::idx_seq<Idxs2...>> {
   using type = camp::idx_seq<Idxs1..., Idxs2...>;
 };
 
@@ -114,9 +103,8 @@ template <camp::idx_t Offset, typename Seq>
 struct offset_seq;
 
 template <camp::idx_t Offset, camp::idx_t... Idxs>
-struct offset_seq<Offset, camp::idx_seq<Idxs...>>
-{
-  using type = camp::idx_seq<(Idxs+Offset)...>;
+struct offset_seq<Offset, camp::idx_seq<Idxs...>> {
+  using type = camp::idx_seq<(Idxs + Offset)...>;
 };
 
 template <camp::idx_t Offset, typename Seq>
@@ -125,60 +113,50 @@ using offset_seq_t = typename offset_seq<Offset, Seq>::type;
 // remove the Nth index in a parameter pack
 // returns linear index of layout(ar...)
 template <typename Lay, RAJA::Index_type Nth = 0, typename Tup>
-RAJA_HOST_DEVICE RAJA_INLINE auto removenth( Lay lyout, Tup&& tup ) ->
-  decltype( selecttuple<Lay>(
-              lyout,
-              std::forward<Tup>(tup),
-              cat_seq_t<  camp::make_idx_seq_t<Nth>,  // sequence up to Nth
-                          offset_seq_t<
-                            Nth+1,  // after Nth
-                            camp::make_idx_seq_t<camp::tuple_size<Tup>::value - Nth-1>
-                          > // sequence after Nth
-                       >{}
-            )
-          )
+RAJA_HOST_DEVICE RAJA_INLINE auto removenth(Lay lyout, Tup &&tup)
+    -> decltype(selecttuple<Lay>(
+        lyout,
+        std::forward<Tup>(tup),
+        cat_seq_t<
+            camp::make_idx_seq_t<Nth>,  // sequence up to Nth
+            offset_seq_t<Nth + 1,       // after Nth
+                         camp::make_idx_seq_t<camp::tuple_size<Tup>::value -
+                                              Nth - 1>>  // sequence after Nth
+            >{}))
 {
   return selecttuple<Lay>(
-              lyout,
-              std::forward<Tup>(tup),
-              cat_seq_t<  camp::make_idx_seq_t<Nth>,  // sequence up to Nth
-                          offset_seq_t<
-                            Nth+1,  // after Nth
-                            camp::make_idx_seq_t<camp::tuple_size<Tup>::value - Nth-1>
-                          > // sequence after Nth
-                       >{}
-          );
+      lyout,
+      std::forward<Tup>(tup),
+      cat_seq_t<camp::make_idx_seq_t<Nth>,  // sequence up to Nth
+                offset_seq_t<Nth + 1,       // after Nth
+                             camp::make_idx_seq_t<camp::tuple_size<Tup>::value -
+                                                  Nth - 1>>  // sequence after
+                                                             // Nth
+                >{});
 }
 
 
-
-
-// P2Pidx represents the array-of-pointers index. This allows the position of the
-// index into the array-of-pointers to be moved around in the MultiView operator();
-// see the operator overload.
-// Default of 0 means that the p2p index is in the 0th position.
+// P2Pidx represents the array-of-pointers index. This allows the position of
+// the index into the array-of-pointers to be moved around in the MultiView
+// operator(); see the operator overload. Default of 0 means that the p2p index
+// is in the 0th position.
 template <typename ValueType,
           typename LayoutType,
           RAJA::Index_type P2Pidx = 0,
           typename PointerType = ValueType **,
-          typename NonConstPointerType =
-              camp::type::ptr::add< // adds *
-                camp::type::ptr::add<
-                  camp::type::cv::rem<  // removes cv
-                    camp::type::ptr::rem<
-                      camp::type::ptr::rem<PointerType>  // removes *
-                    >
-                  >
-                >
-              >
-          >
+          typename NonConstPointerType = camp::type::ptr::add<  // adds *
+              camp::type::ptr::add<camp::type::cv::rem<         // removes cv
+                  camp::type::ptr::rem<camp::type::ptr::rem<PointerType>  // removes
+                                                                          // *
+                                       >>>>>
 struct MultiView {
   using value_type = ValueType;
   using pointer_type = PointerType;
   using layout_type = LayoutType;
   using nc_value_type = camp::decay<value_type>;
   using nc_pointer_type = NonConstPointerType;
-  using NonConstView = MultiView<nc_value_type, layout_type, P2Pidx, nc_pointer_type>;
+  using NonConstView =
+      MultiView<nc_value_type, layout_type, P2Pidx, nc_pointer_type>;
 
   layout_type const layout;
   nc_pointer_type data;
@@ -196,29 +174,32 @@ struct MultiView {
 
   RAJA_INLINE constexpr MultiView(MultiView const &) = default;
   RAJA_INLINE constexpr MultiView(MultiView &&) = default;
-  RAJA_INLINE MultiView& operator=(MultiView const &) = default;
-  RAJA_INLINE MultiView& operator=(MultiView &&) = default;
+  RAJA_INLINE MultiView &operator=(MultiView const &) = default;
+  RAJA_INLINE MultiView &operator=(MultiView &&) = default;
 
   template <bool IsConstView = std::is_const<value_type>::value>
   RAJA_INLINE constexpr MultiView(
       typename std::enable_if<IsConstView, NonConstView>::type const &rhs)
-      : layout(rhs.layout),
-        data(rhs.data)
+      : layout(rhs.layout), data(rhs.data)
   {
   }
 
   RAJA_INLINE void set_data(pointer_type data_ptr) { data = data_ptr; }
 
-  template <size_t n_dims=layout_type::n_dims, typename IdxLin = Index_type>
-  RAJA_INLINE RAJA::MultiView<ValueType, typename add_offset<layout_type>::type, P2Pidx>
-  shift(const std::array<IdxLin, n_dims>& shift)
+  template <size_t n_dims = layout_type::n_dims, typename IdxLin = Index_type>
+  RAJA_INLINE RAJA::
+      MultiView<ValueType, typename add_offset<layout_type>::type, P2Pidx>
+      shift(const std::array<IdxLin, n_dims> &shift)
   {
-    static_assert(n_dims==layout_type::n_dims, "Dimension mismatch in view shift");
+    static_assert(n_dims == layout_type::n_dims,
+                  "Dimension mismatch in view shift");
 
     typename add_offset<layout_type>::type shift_layout(layout);
     shift_layout.shift(shift);
 
-    return RAJA::MultiView<ValueType, typename add_offset<layout_type>::type, P2Pidx>(data, shift_layout);
+    return RAJA::MultiView<ValueType,
+                           typename add_offset<layout_type>::type,
+                           P2Pidx>(data, shift_layout);
   }
 
   // Moving the position of the index into the array-of-pointers
@@ -228,14 +209,16 @@ struct MultiView {
   template <typename... Args>
   RAJA_HOST_DEVICE RAJA_INLINE value_type &operator()(Args... ar) const
   {
-    auto pidx = stripIndexType( camp::get<P2Pidx>( camp::forward_as_tuple( ar... ) ) );
+    auto pidx =
+        stripIndexType(camp::get<P2Pidx>(camp::forward_as_tuple(ar...)));
 
-    if ( pidx < 0 )
-    {
-      RAJA_ABORT_OR_THROW( "Negative index while accessing array of pointers.\n" );
+    if (pidx < 0) {
+      RAJA_ABORT_OR_THROW(
+          "Negative index while accessing array of pointers.\n");
     }
-    
-    auto idx = stripIndexType( removenth<LayoutType, P2Pidx>( layout, camp::forward_as_tuple( ar... ) ) );
+
+    auto idx = stripIndexType(
+        removenth<LayoutType, P2Pidx>(layout, camp::forward_as_tuple(ar...)));
     return data[pidx][idx];
   }
 };
@@ -255,7 +238,7 @@ struct AtomicViewWrapper {
   RAJA_INLINE void set_data(pointer_type data_ptr) { base_.set_data(data_ptr); }
 
   template <typename... ARGS>
-  RAJA_HOST_DEVICE RAJA_INLINE atomic_type operator()(ARGS &&... args) const
+  RAJA_HOST_DEVICE RAJA_INLINE atomic_type operator()(ARGS &&...args) const
   {
     return atomic_type(&base_.operator()(std::forward<ARGS>(args)...));
   }
@@ -281,7 +264,7 @@ struct AtomicViewWrapper<ViewType, RAJA::seq_atomic> {
   RAJA_INLINE void set_data(pointer_type data_ptr) { base_.set_data(data_ptr); }
 
   template <typename... ARGS>
-  RAJA_HOST_DEVICE RAJA_INLINE value_type &operator()(ARGS &&... args) const
+  RAJA_HOST_DEVICE RAJA_INLINE value_type &operator()(ARGS &&...args) const
   {
     return base_.operator()(std::forward<ARGS>(args)...);
   }
diff --git a/include/RAJA/util/align.hpp b/include/RAJA/util/align.hpp
index 7103ecb152..b39e24c18a 100644
--- a/include/RAJA/util/align.hpp
+++ b/include/RAJA/util/align.hpp
@@ -32,13 +32,15 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space)
 {
 
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4146 )  // Force msvc to ignore subtracting from signed number warning
+#pragma warning(disable : 4146)  // Force msvc to ignore the "unary minus
+                                 // applied to unsigned type" warning
 #endif
   void* r = nullptr;
   if (size <= space) {
     char* p1 = static_cast<char*>(ptr);
     char* p2 = reinterpret_cast<char*>(
-        reinterpret_cast<size_t>(p1 + (static_cast<ptrdiff_t>(alignment) - 1)) & -alignment);
+        reinterpret_cast<size_t>(p1 + (static_cast<ptrdiff_t>(alignment) - 1)) &
+        -alignment);
     size_t d = static_cast<size_t>(p2 - p1);
     if (d <= space - size) {
       r = p2;
@@ -49,9 +51,9 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space)
   return r;
 
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4146 )  // Force msvc to ignore subtracting from signed number warning
+#pragma warning(default : 4146)  // Restore msvc's default handling of the
+                                 // "unary minus on unsigned type" warning
 #endif
-
 }
 
 }  // end namespace RAJA
diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp
index f0208ccbd3..1bd7f8b7d7 100644
--- a/include/RAJA/util/basic_mempool.hpp
+++ b/include/RAJA/util/basic_mempool.hpp
@@ -60,11 +60,11 @@ class MemoryArena
   using used_value_type = typename used_type::value_type;
 
   MemoryArena(void* ptr, size_t size)
-    : m_allocation{ ptr, static_cast<char*>(ptr)+size },
-      m_free_space(),
-      m_used_space()
+      : m_allocation{ptr, static_cast<char*>(ptr) + size},
+        m_free_space(),
+        m_used_space()
   {
-     m_free_space[ptr] = static_cast<char*>(ptr)+size ;
+    m_free_space[ptr] = static_cast<char*>(ptr) + size;
     if (m_allocation.begin == nullptr) {
       fprintf(stderr, "Attempt to create MemoryArena with no memory");
       std::abort();
diff --git a/include/RAJA/util/camp_aliases.hpp b/include/RAJA/util/camp_aliases.hpp
index c747ac64a0..c334404b42 100644
--- a/include/RAJA/util/camp_aliases.hpp
+++ b/include/RAJA/util/camp_aliases.hpp
@@ -25,11 +25,10 @@
 
 #include "RAJA/config.hpp"
 #include "RAJA/util/macros.hpp"
-
 #include "camp/defines.hpp"
 #include "camp/list/list.hpp"
-#include "camp/tuple.hpp"
 #include "camp/resource.hpp"
+#include "camp/tuple.hpp"
 
 namespace RAJA
 {
diff --git a/include/RAJA/util/concepts.hpp b/include/RAJA/util/concepts.hpp
index 4372993949..4671fd4a12 100644
--- a/include/RAJA/util/concepts.hpp
+++ b/include/RAJA/util/concepts.hpp
@@ -34,17 +34,17 @@ using namespace camp::concepts;
 
 template <typename From, typename To>
 struct ConvertibleTo
-  : DefineConcept(::RAJA::concepts::convertible_to<To>(camp::val<From>())) {
+    : DefineConcept(::RAJA::concepts::convertible_to<To>(camp::val<From>())) {
 };
 
-}
+}  // namespace concepts
 
 namespace type_traits
 {
 using namespace camp::type_traits;
 
 DefineTypeTraitFromConcept(convertible_to, concepts::ConvertibleTo);
-}
+}  // namespace type_traits
 
 }  // end namespace RAJA
 
diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp
index 25783b2a0a..5a8be1b5ee 100644
--- a/include/RAJA/util/for_each.hpp
+++ b/include/RAJA/util/for_each.hpp
@@ -18,17 +18,14 @@
 #ifndef RAJA_util_for_each_HPP
 #define RAJA_util_for_each_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iterator>
 #include <type_traits>
 
-#include "camp/list.hpp"
-
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/detail/algorithm.hpp"
-
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "camp/list.hpp"
 
 namespace RAJA
 {
@@ -38,9 +35,10 @@ namespace detail
 
 // runtime loop applying func to each element in the range in order
 RAJA_SUPPRESS_HD_WARN
-template<typename Iter, typename UnaryFunc>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func)
+template <typename Iter, typename UnaryFunc>
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each(Iter begin,
+                                                Iter end,
+                                                UnaryFunc func)
 {
   for (; begin != end; ++begin) {
     func(*begin);
@@ -52,8 +50,8 @@ UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func)
 // compile time expansion applying func to a each type in the list in order
 RAJA_SUPPRESS_HD_WARN
 template <typename UnaryFunc, typename... Ts>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_type(camp::list<Ts...> const&, UnaryFunc func)
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list<Ts...> const&,
+                                                     UnaryFunc func)
 {
   // braced init lists are evaluated in order
   int seq_unused_array[] = {0, (func(Ts{}), 0)...};
@@ -65,8 +63,9 @@ UnaryFunc for_each_type(camp::list<Ts...> const&, UnaryFunc func)
 // compile time expansion applying func to a each type in the tuple in order
 RAJA_SUPPRESS_HD_WARN
 template <typename Tuple, typename UnaryFunc, camp::idx_t... Is>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func, camp::idx_seq<Is...>)
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t,
+                                                      UnaryFunc func,
+                                                      camp::idx_seq<Is...>)
 {
   using camp::get;
   // braced init lists are evaluated in order
@@ -87,7 +86,7 @@ UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func, camp::idx_seq<Is...>)
 RAJA_SUPPRESS_HD_WARN
 template <typename Container, typename UnaryFunc>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<UnaryFunc, type_traits::is_range<Container>>
+    concepts::enable_if_t<UnaryFunc, type_traits::is_range<Container>>
     for_each(Container&& c, UnaryFunc func)
 {
   using std::begin;
@@ -102,22 +101,23 @@ concepts::enable_if_t<UnaryFunc, type_traits::is_range<Container>>
 */
 RAJA_SUPPRESS_HD_WARN
 template <typename UnaryFunc, typename... Ts>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_type(camp::list<Ts...> const& c, UnaryFunc func)
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list<Ts...> const& c,
+                                                     UnaryFunc func)
 {
   return detail::for_each_type(c, std::move(func));
 }
 
 /*!
-  \brief Apply func to each object in the given tuple or tuple like type in order
-  using a compile-time expansion in O(N) operations and O(1) extra memory
+  \brief Apply func to each object in the given tuple or tuple like type in
+  order using a compile-time expansion in O(N) operations and O(1) extra memory
 */
 RAJA_SUPPRESS_HD_WARN
 template <typename Tuple, typename UnaryFunc>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func)
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func)
 {
-  return detail::for_each_tuple(std::forward<Tuple>(t), std::move(func),
+  return detail::for_each_tuple(
+      std::forward<Tuple>(t),
+      std::move(func),
       camp::make_idx_seq_t<std::tuple_size<camp::decay<Tuple>>::value>{});
 }
 
diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp
index 9ddb5bebb7..e4fbdd272b 100644
--- a/include/RAJA/util/macros.hpp
+++ b/include/RAJA/util/macros.hpp
@@ -18,11 +18,12 @@
 #ifndef RAJA_INTERNAL_MACROS_HPP
 #define RAJA_INTERNAL_MACROS_HPP
 
-#include "RAJA/config.hpp"
+#include <stdio.h>
 
 #include <cstdlib>
 #include <stdexcept>
-#include <stdio.h>
+
+#include "RAJA/config.hpp"
 
 #if defined(RAJA_HIP_ACTIVE)
 #include <hip/hip_runtime.h>
@@ -33,9 +34,9 @@
 // We need a better solution than this as it is a pain to manage
 // this stuff in an application.
 //
-#if (defined(RAJA_ENABLE_CUDA) && defined(__CUDA_ARCH__)) \
-  || (defined(RAJA_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) \
-  || (defined(RAJA_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__))
+#if (defined(RAJA_ENABLE_CUDA) && defined(__CUDA_ARCH__)) ||         \
+    (defined(RAJA_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \
+    (defined(RAJA_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__))
 #define RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE
 #endif
 
@@ -142,10 +143,10 @@ RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T &&...) noexcept
  */
 #if defined(RAJA_ENABLE_OPENMP)
 #define RAJA_OMP_DECLARE_REDUCTION_COMBINE \
-      _Pragma(" omp declare reduction( combine \
+  _Pragma(                                 \
+      " omp declare reduction( combine \
         : typename std::remove_reference<decltype(f_params)>::type \
-        : RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(omp_out, omp_in) ) ")\
-        //initializer(omp_priv = omp_in) ")
+        : RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(omp_out, omp_in) ) ")  // initializer(omp_priv = omp_in) ")
 #endif
 
 
@@ -153,15 +154,15 @@ RAJA_HOST_DEVICE
 inline void RAJA_ABORT_OR_THROW(const char *str)
 {
 #if defined(__SYCL_DEVICE_ONLY__)
-  //segfault here ran into linking problems
+  // segfault here ran into linking problems
   *((volatile char *)0) = 0;  // write to address 0
 #else
-  printf ( "%s\n", str );
+  printf("%s\n", str);
 #if defined(RAJA_ENABLE_TARGET_OPENMP) && (_OPENMP >= 201511)
   // seg faulting here instead of calling std::abort for omp target
   *((volatile char *)0) = 0;  // write to address 0
 #elif defined(__CUDA_ARCH__)
-  asm ("trap;");
+  asm("trap;");
 
 #elif defined(__HIP_DEVICE_COMPILE__)
   abort();
@@ -172,7 +173,7 @@ inline void RAJA_ABORT_OR_THROW(const char *str)
   char *value;
   size_t len;
   bool no_except = false;
-  if(_dupenv_s(&value, &len, "RAJA_NO_EXCEPT") == 0 && value != nullptr){
+  if (_dupenv_s(&value, &len, "RAJA_NO_EXCEPT") == 0 && value != nullptr) {
     no_except = true;
     free(value);
   }
diff --git a/include/RAJA/util/math.hpp b/include/RAJA/util/math.hpp
index 99d7bc192e..82c747c278 100644
--- a/include/RAJA/util/math.hpp
+++ b/include/RAJA/util/math.hpp
@@ -18,10 +18,10 @@
 #ifndef RAJA_util_math_HPP
 #define RAJA_util_math_HPP
 
-#include "RAJA/config.hpp"
-
-#include <type_traits>
 #include <climits>
+#include <type_traits>
+
+#include "RAJA/config.hpp"
 
 namespace RAJA
 {
@@ -34,14 +34,12 @@ namespace RAJA
     For zero or negative n return 0
 
 */
-template < typename T,
-           std::enable_if_t<std::is_integral<T>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr T log2(T n) noexcept
+template <typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr T log2(T n) noexcept
 {
   T result = 0;
   if (n > 0) {
-    while(n >>= 1) {
+    while (n >>= 1) {
       ++result;
     }
   }
@@ -57,13 +55,11 @@ constexpr T log2(T n) noexcept
         if n is not a power of 2, return the next greater power of 2
       if n is negative, return 0
 */
-template < typename T,
-           std::enable_if_t<std::is_integral<T>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr T next_pow2(T n) noexcept
+template <typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr T next_pow2(T n) noexcept
 {
   --n;
-  for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) {
+  for (size_t s = 1; s < CHAR_BIT * sizeof(T); s *= 2) {
     n |= n >> s;
   }
   ++n;
@@ -71,7 +67,8 @@ constexpr T next_pow2(T n) noexcept
 }
 
 /*!
-    \brief "round down" to the largest power of 2 that is less than or equal to n
+    \brief "round down" to the largest power of 2 that is less than or equal to
+   n
 
     For an integer n,
       if n is negative, return 0
@@ -79,13 +76,11 @@ constexpr T next_pow2(T n) noexcept
         if n is a power of 2, return n
         else return the largest power of 2 that is less than n
 */
-template < typename T,
-           std::enable_if_t<std::is_integral<T>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr T prev_pow2(T n) noexcept
+template <typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr T prev_pow2(T n) noexcept
 {
-  if ( n < 0 ) return 0;
-  for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) {
+  if (n < 0) return 0;
+  for (size_t s = 1; s < CHAR_BIT * sizeof(T); s *= 2) {
     n |= n >> s;
   }
   return n - (n >> 1);
@@ -94,12 +89,14 @@ constexpr T prev_pow2(T n) noexcept
 /*!
     \brief compute lhs mod rhs where lhs is non-negative and rhs is a power of 2
 */
-template < typename L, typename R,
-           std::enable_if_t<std::is_integral<L>::value && std::is_integral<R>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr auto power_of_2_mod(L lhs, R rhs) noexcept
+template <typename L,
+          typename R,
+          std::enable_if_t<std::is_integral<L>::value &&
+                           std::is_integral<R>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr auto power_of_2_mod(L lhs,
+                                                           R rhs) noexcept
 {
-  return lhs & (rhs-R(1));
+  return lhs & (rhs - R(1));
 }
 
 }  // namespace RAJA
diff --git a/include/RAJA/util/plugins.hpp b/include/RAJA/util/plugins.hpp
index d5f42efde0..4bf6b2bed3 100644
--- a/include/RAJA/util/plugins.hpp
+++ b/include/RAJA/util/plugins.hpp
@@ -9,112 +9,90 @@
 #define RAJA_plugins_HPP
 
 #include "RAJA/config.hpp"
-
 #include "RAJA/util/PluginContext.hpp"
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/PluginStrategy.hpp"
 #if defined(RAJA_ENABLE_RUNTIME_PLUGINS)
-#include "RAJA/util/RuntimePluginLoader.hpp"
 #include "RAJA/util/KokkosPluginLoader.hpp"
+#include "RAJA/util/RuntimePluginLoader.hpp"
 #endif
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 template <typename T>
-RAJA_INLINE auto trigger_updates_before(T&& item)
-  -> typename std::remove_reference<T>::type
+RAJA_INLINE auto trigger_updates_before(T&& item) ->
+    typename std::remove_reference<T>::type
 {
   return item;
 }
 
 RAJA_INLINE
-void
-callPreCapturePlugins(const PluginContext& p)
+void callPreCapturePlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
-  {
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin) {
     (*plugin).get()->preCapture(p);
   }
 }
 
 RAJA_INLINE
-void
-callPostCapturePlugins(const PluginContext& p)
+void callPostCapturePlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
-  {
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin) {
     (*plugin).get()->postCapture(p);
   }
 }
 
 RAJA_INLINE
-void
-callPreLaunchPlugins(const PluginContext& p)
+void callPreLaunchPlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
-  {
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin) {
     (*plugin).get()->preLaunch(p);
   }
 }
 
 RAJA_INLINE
-void
-callPostLaunchPlugins(const PluginContext& p)
+void callPostLaunchPlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
-  {
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin) {
     (*plugin).get()->postLaunch(p);
   }
 }
 
 RAJA_INLINE
-void
-callInitPlugins(const PluginOptions p)
+void callInitPlugins(const PluginOptions p)
 {
-  for (auto plugin = PluginRegistry::begin(); 
-      plugin != PluginRegistry::end();
-      ++plugin)
-  {
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin) {
     (*plugin).get()->init(p);
   }
 }
 
 RAJA_INLINE
-void
-init_plugins(const std::string& path)
-{   
+void init_plugins(const std::string& path)
+{
   callInitPlugins(make_options(path));
 }
 
 RAJA_INLINE
-void
-init_plugins()
-{   
-  callInitPlugins(make_options(""));
-}
+void init_plugins() { callInitPlugins(make_options("")); }
 
 RAJA_INLINE
-void
-finalize_plugins()
-{   
-  for (auto plugin = PluginRegistry::begin(); 
-    plugin != PluginRegistry::end();
-    ++plugin)
-  {
+void finalize_plugins()
+{
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin) {
     (*plugin).get()->finalize();
   }
 }
 
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
+}  // namespace util
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/reduce.hpp b/include/RAJA/util/reduce.hpp
index 6d0c28f861..bbdf567acf 100644
--- a/include/RAJA/util/reduce.hpp
+++ b/include/RAJA/util/reduce.hpp
@@ -18,19 +18,17 @@
 #ifndef RAJA_util_reduce_HPP
 #define RAJA_util_reduce_HPP
 
-#include "RAJA/config.hpp"
-
 #include <climits>
 #include <iterator>
 #include <new>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/detail/algorithm.hpp"
-
-#include "RAJA/util/macros.hpp"
+#include "RAJA/util/Operators.hpp"
 #include "RAJA/util/concepts.hpp"
+#include "RAJA/util/macros.hpp"
 #include "RAJA/util/math.hpp"
-#include "RAJA/util/Operators.hpp"
 
 namespace RAJA
 {
@@ -42,21 +40,18 @@ namespace detail
     \brief Reduce class that does a reduction with a left fold.
 */
 template <typename T, typename BinaryOp>
-struct LeftFoldReduce
-{
-  RAJA_HOST_DEVICE RAJA_INLINE
-  constexpr explicit LeftFoldReduce(T init = BinaryOp::identity(),
-                                      BinaryOp op = BinaryOp{}) noexcept
-    : m_op(std::move(op))
-    , m_accumulated_value(std::move(init))
+struct LeftFoldReduce {
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit LeftFoldReduce(
+      T init = BinaryOp::identity(),
+      BinaryOp op = BinaryOp{}) noexcept
+      : m_op(std::move(op)), m_accumulated_value(std::move(init))
   {
-
   }
 
   LeftFoldReduce(LeftFoldReduce const&) = delete;
   LeftFoldReduce& operator=(LeftFoldReduce const&) = delete;
-  LeftFoldReduce(LeftFoldReduce &&) = delete;
-  LeftFoldReduce& operator=(LeftFoldReduce &&) = delete;
+  LeftFoldReduce(LeftFoldReduce&&) = delete;
+  LeftFoldReduce& operator=(LeftFoldReduce&&) = delete;
 
   ~LeftFoldReduce() = default;
 
@@ -64,8 +59,7 @@ struct LeftFoldReduce
   /*!
       \brief reset the combined value of the reducer to the identity
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void clear() noexcept
+  RAJA_HOST_DEVICE RAJA_INLINE void clear() noexcept
   {
     m_accumulated_value = BinaryOp::identity();
   }
@@ -73,8 +67,7 @@ struct LeftFoldReduce
   /*!
       \brief return the combined value and clear the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get_and_clear()
+  RAJA_HOST_DEVICE RAJA_INLINE T get_and_clear()
   {
     T accumulated_value = std::move(m_accumulated_value);
 
@@ -86,17 +79,12 @@ struct LeftFoldReduce
   /*!
       \brief return the combined value
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get()
-  {
-    return m_accumulated_value;
-  }
+  RAJA_HOST_DEVICE RAJA_INLINE T get() { return m_accumulated_value; }
 
   /*!
       \brief combine a value into the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void combine(T val)
+  RAJA_HOST_DEVICE RAJA_INLINE void combine(T val)
   {
     m_accumulated_value = m_op(std::move(m_accumulated_value), std::move(val));
   }
@@ -109,40 +97,38 @@ struct LeftFoldReduce
 /*!
     \brief Reduce class that does a reduction with a binary tree.
 */
-template <typename T, typename BinaryOp, typename SizeType = size_t,
-          SizeType t_num_levels = CHAR_BIT*sizeof(SizeType)>
-struct BinaryTreeReduce
-{
+template <typename T,
+          typename BinaryOp,
+          typename SizeType = size_t,
+          SizeType t_num_levels = CHAR_BIT * sizeof(SizeType)>
+struct BinaryTreeReduce {
   static_assert(std::is_unsigned<SizeType>::value, "SizeType must be unsigned");
-  static_assert(t_num_levels <= CHAR_BIT*sizeof(SizeType), "SizeType must be large enough to act at a bitset for num_levels");
+  static_assert(t_num_levels <= CHAR_BIT * sizeof(SizeType),
+                "SizeType must be large enough to act at a bitset for "
+                "num_levels");
 
   static constexpr SizeType num_levels = t_num_levels;
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  constexpr explicit BinaryTreeReduce(T init = BinaryOp::identity(),
-                                      BinaryOp op = BinaryOp{}) noexcept
-    : m_op(std::move(op))
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit BinaryTreeReduce(
+      T init = BinaryOp::identity(),
+      BinaryOp op = BinaryOp{}) noexcept
+      : m_op(std::move(op))
   {
     combine(std::move(init));
   }
 
   BinaryTreeReduce(BinaryTreeReduce const&) = delete;
   BinaryTreeReduce& operator=(BinaryTreeReduce const&) = delete;
-  BinaryTreeReduce(BinaryTreeReduce &&) = delete;
-  BinaryTreeReduce& operator=(BinaryTreeReduce &&) = delete;
+  BinaryTreeReduce(BinaryTreeReduce&&) = delete;
+  BinaryTreeReduce& operator=(BinaryTreeReduce&&) = delete;
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  ~BinaryTreeReduce()
-  {
-    clear();
-  }
+  RAJA_HOST_DEVICE RAJA_INLINE ~BinaryTreeReduce() { clear(); }
 
 
   /*!
       \brief reset the combined value of the reducer to the identity
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void clear() noexcept
+  RAJA_HOST_DEVICE RAJA_INLINE void clear() noexcept
   {
     // destroy all values on the tree stack and reset count to 0
     for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) {
@@ -152,7 +138,6 @@ struct BinaryTreeReduce
         get_value(level)->~T();
 
         m_count ^= mask;
-
       }
     }
   }
@@ -160,8 +145,7 @@ struct BinaryTreeReduce
   /*!
       \brief return the combined value and clear the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get_and_clear()
+  RAJA_HOST_DEVICE RAJA_INLINE T get_and_clear()
   {
     // accumulate all values
     T value = BinaryOp::identity();
@@ -183,13 +167,13 @@ struct BinaryTreeReduce
   /*!
       \brief return the combined value
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get()
+  RAJA_HOST_DEVICE RAJA_INLINE T get()
   {
     // accumulate all values
     T value = BinaryOp::identity();
 
-    for (SizeType count = m_count, level = 0, mask = 1; count; ++level, mask <<= 1) {
+    for (SizeType count = m_count, level = 0, mask = 1; count;
+         ++level, mask <<= 1) {
 
       if (count & mask) {
 
@@ -205,8 +189,7 @@ struct BinaryTreeReduce
   /*!
       \brief combine a value into the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void combine(T value)
+  RAJA_HOST_DEVICE RAJA_INLINE void combine(T value)
   {
     // accumulate values and store in the first unused level found
     // clear values from used levels along the way
@@ -215,10 +198,9 @@ struct BinaryTreeReduce
 
       value = m_op(std::move(*get_value(level)), std::move(value));
       get_value(level)->~T();
-
     }
 
-    new(get_storage(level)) T(std::move(value));
+    new (get_storage(level)) T(std::move(value));
 
     ++m_count;
   }
@@ -234,14 +216,12 @@ struct BinaryTreeReduce
   // values or is unused and has no value.
   std::aligned_storage_t<sizeof(T), alignof(T)> m_tree_stack[num_levels];
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void* get_storage(SizeType level)
+  RAJA_HOST_DEVICE RAJA_INLINE void* get_storage(SizeType level)
   {
     return &m_tree_stack[level];
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T* get_value(SizeType level)
+  RAJA_HOST_DEVICE RAJA_INLINE T* get_value(SizeType level)
   {
 #if __cplusplus >= 201703L && !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
     // TODO: check that launder is supported in device code
@@ -254,10 +234,10 @@ struct BinaryTreeReduce
 
 
 template <typename T, typename BinaryOp>
-using HighAccuracyReduce = std::conditional_t<
-    RAJA::operators::is_fp_associative<T>::value,
-      BinaryTreeReduce<T, BinaryOp>,
-      LeftFoldReduce<T, BinaryOp>>;
+using HighAccuracyReduce =
+    std::conditional_t<RAJA::operators::is_fp_associative<T>::value,
+                       BinaryTreeReduce<T, BinaryOp>,
+                       LeftFoldReduce<T, BinaryOp>>;
 
 
 /*!
@@ -265,18 +245,14 @@ using HighAccuracyReduce = std::conditional_t<
            operation using O(N) operations and O(1) memory
 */
 template <typename Iter, typename T, typename BinaryOp>
-RAJA_HOST_DEVICE RAJA_INLINE
-T left_fold_reduce(Iter begin,
-                   Iter end,
-                   T init,
-                   BinaryOp op)
+RAJA_HOST_DEVICE RAJA_INLINE T
+left_fold_reduce(Iter begin, Iter end, T init, BinaryOp op)
 {
   LeftFoldReduce<T, BinaryOp> reducer(std::move(init), std::move(op));
 
   for (; begin != end; ++begin) {
 
     reducer.combine(*begin);
-
   }
 
   return reducer.get_and_clear();
@@ -290,20 +266,17 @@ T left_fold_reduce(Iter begin,
     floating point types.
 */
 template <typename Iter, typename T, typename BinaryOp>
-RAJA_HOST_DEVICE RAJA_INLINE
-T binary_tree_reduce(Iter begin,
-                     Iter end,
-                     T init,
-                     BinaryOp op)
+RAJA_HOST_DEVICE RAJA_INLINE T
+binary_tree_reduce(Iter begin, Iter end, T init, BinaryOp op)
 {
   using std::distance;
   using SizeType = std::make_unsigned_t<decltype(distance(begin, end))>;
-  BinaryTreeReduce<T, BinaryOp, SizeType> reducer(std::move(init), std::move(op));
+  BinaryTreeReduce<T, BinaryOp, SizeType> reducer(std::move(init),
+                                                  std::move(op));
 
   for (; begin != end; ++begin) {
 
     reducer.combine(*begin);
-
   }
 
   return reducer.get_and_clear();
@@ -315,18 +288,14 @@ T binary_tree_reduce(Iter begin,
     is a concern, or a faster algorithm with it is not a concern
 */
 template <typename Iter, typename T, typename BinaryOp>
-RAJA_HOST_DEVICE RAJA_INLINE
-T high_accuracy_reduce(Iter begin,
-                        Iter end,
-                        T init,
-                        BinaryOp op)
+RAJA_HOST_DEVICE RAJA_INLINE T
+high_accuracy_reduce(Iter begin, Iter end, T init, BinaryOp op)
 {
   HighAccuracyReduce<T, BinaryOp> reducer(std::move(init), std::move(op));
 
   for (; begin != end; ++begin) {
 
     reducer.combine(*begin);
-
   }
 
   return reducer.get_and_clear();
@@ -343,15 +312,20 @@ template <typename Container,
           typename T = detail::ContainerVal<Container>,
           typename BinaryOp = operators::plus<T>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<T, type_traits::is_range<Container>>
-    accumulate(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{})
+    concepts::enable_if_t<T, type_traits::is_range<Container>>
+    accumulate(Container&& c,
+               T init = BinaryOp::identity(),
+               BinaryOp op = BinaryOp{})
 {
   using std::begin;
   using std::end;
   static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                 "BinaryOp must model BinaryFunction");
 
-  return detail::left_fold_reduce(begin(c), end(c), std::move(init), std::move(op));
+  return detail::left_fold_reduce(begin(c),
+                                  end(c),
+                                  std::move(init),
+                                  std::move(op));
 }
 
 /*!
@@ -363,15 +337,20 @@ template <typename Container,
           typename T = detail::ContainerVal<Container>,
           typename BinaryOp = operators::plus<T>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<T, type_traits::is_range<Container>>
-    binary_tree_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{})
+    concepts::enable_if_t<T, type_traits::is_range<Container>>
+    binary_tree_reduce(Container&& c,
+                       T init = BinaryOp::identity(),
+                       BinaryOp op = BinaryOp{})
 {
   using std::begin;
   using std::end;
   static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                 "BinaryOp must model BinaryFunction");
 
-  return detail::binary_tree_reduce(begin(c), end(c), std::move(init), std::move(op));
+  return detail::binary_tree_reduce(begin(c),
+                                    end(c),
+                                    std::move(init),
+                                    std::move(op));
 }
 
 /*!
@@ -384,15 +363,20 @@ template <typename Container,
           typename T = detail::ContainerVal<Container>,
           typename BinaryOp = operators::plus<T>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<T, type_traits::is_range<Container>>
-    high_accuracy_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{})
+    concepts::enable_if_t<T, type_traits::is_range<Container>>
+    high_accuracy_reduce(Container&& c,
+                         T init = BinaryOp::identity(),
+                         BinaryOp op = BinaryOp{})
 {
   using std::begin;
   using std::end;
   static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                 "BinaryOp must model BinaryFunction");
 
-  return detail::high_accuracy_reduce(begin(c), end(c), std::move(init), std::move(op));
+  return detail::high_accuracy_reduce(begin(c),
+                                      end(c),
+                                      std::move(init),
+                                      std::move(op));
 }
 
 }  // namespace RAJA
diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp
index 28a476d951..67d67505b7 100644
--- a/include/RAJA/util/resource.hpp
+++ b/include/RAJA/util/resource.hpp
@@ -30,152 +30,201 @@
 #if defined(RAJA_SYCL_ACTIVE)
 #include "RAJA/policy/sycl/policy.hpp"
 #endif
-#include "RAJA/policy/sequential/policy.hpp"
-#include "RAJA/policy/openmp_target/policy.hpp"
 #include "RAJA/internal/get_platform.hpp"
+#include "RAJA/policy/openmp_target/policy.hpp"
+#include "RAJA/policy/sequential/policy.hpp"
 
 namespace RAJA
 {
 
-  namespace resources
-  {
-  using namespace camp::resources;
+namespace resources
+{
+using namespace camp::resources;
 
-  template<typename e>
-  struct get_resource{
-    using type = camp::resources::Host;
-  };
+template <typename e>
+struct get_resource {
+  using type = camp::resources::Host;
+};
 
-  template<Platform>
-  struct get_resource_from_platform{
-    using type = camp::resources::Host;
-  };
+template <Platform>
+struct get_resource_from_platform {
+  using type = camp::resources::Host;
+};
 
-  template<typename ExecPol>
-  using resource_from_pol_t = typename get_resource_from_platform<detail::get_platform<ExecPol>::value>::type;
+template <typename ExecPol>
+using resource_from_pol_t = typename get_resource_from_platform<
+    detail::get_platform<ExecPol>::value>::type;
 
-  template<typename ExecPol>
-  constexpr resource_from_pol_t<ExecPol> get_default_resource() {
-    return resource_from_pol_t<ExecPol>::get_default();
-  }
+template <typename ExecPol>
+constexpr resource_from_pol_t<ExecPol> get_default_resource()
+{
+  return resource_from_pol_t<ExecPol>::get_default();
+}
 
 #if defined(RAJA_CUDA_ACTIVE)
-  template<>
-  struct get_resource_from_platform<Platform::cuda>{
-    using type = camp::resources::Cuda;
-  };
-
-  template<typename IterationMapping, typename IterationGetter,
-           typename Concretizer, size_t BLOCKS_PER_SM, bool Async>
-  struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>>{
-    using type = camp::resources::Cuda;
-  };
-
-  template <bool Async, int num_threads, size_t BLOCKS_PER_SM>
-  struct get_resource<::RAJA::policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>>{
-    using type = camp::resources::Cuda;
-  };
-
-  template<typename ISetIter, typename IterationMapping, typename IterationGetter,
-           typename Concretizer, size_t BLOCKS_PER_SM, bool Async>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>>>{
-    using type = camp::resources::Cuda;
-  };
+template <>
+struct get_resource_from_platform<Platform::cuda> {
+  using type = camp::resources::Cuda;
+};
+
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async>
+struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                             IterationGetter,
+                                                             Concretizer,
+                                                             BLOCKS_PER_SM,
+                                                             Async>> {
+  using type = camp::resources::Cuda;
+};
+
+template <bool Async, int num_threads, size_t BLOCKS_PER_SM>
+struct get_resource<
+    ::RAJA::policy::cuda::
+        cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>> {
+  using type = camp::resources::Cuda;
+};
+
+template <typename ISetIter,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async>
+struct get_resource<
+    ExecPolicy<ISetIter,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async>>> {
+  using type = camp::resources::Cuda;
+};
 #endif
 
 #if defined(RAJA_HIP_ACTIVE)
-  template<>
-  struct get_resource_from_platform<Platform::hip>{
-    using type = camp::resources::Hip;
-  };
-
-  template<typename IterationMapping, typename IterationGetter,
-           typename Concretizer, bool Async>
-  struct get_resource<::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>{
-    using type = camp::resources::Hip;
-  };
-
-  template <bool Async, int num_threads>
-  struct get_resource<::RAJA::policy::hip::hip_launch_t<Async, num_threads>>{
-    using type = camp::resources::Hip;
-  };
-
-  template<typename ISetIter, typename IterationMapping, typename IterationGetter,
-           typename Concretizer, bool Async>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>>{
-    using type = camp::resources::Hip;
-  };
+template <>
+struct get_resource_from_platform<Platform::hip> {
+  using type = camp::resources::Hip;
+};
+
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async>
+struct get_resource<
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>> {
+  using type = camp::resources::Hip;
+};
+
+template <bool Async, int num_threads>
+struct get_resource<::RAJA::policy::hip::hip_launch_t<Async, num_threads>> {
+  using type = camp::resources::Hip;
+};
+
+template <typename ISetIter,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async>
+struct get_resource<ExecPolicy<
+    ISetIter,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>> {
+  using type = camp::resources::Hip;
+};
 #endif
 
 #if defined(RAJA_SYCL_ACTIVE)
-  template<>
-  struct get_resource_from_platform<Platform::sycl>{
-    using type = camp::resources::Sycl;
-  };
-
-  template<size_t BlockSize, bool Async>
-  struct get_resource<::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>{
-    using type = camp::resources::Sycl;
-  };
-
-  template <bool Async, int num_threads>
-  struct get_resource<::RAJA::policy::sycl::sycl_launch_t<Async, num_threads>>{
-    using type = camp::resources::Sycl;
-  };
-
-  template<typename ISetIter, size_t BlockSize, bool Async>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>>{
-    using type = camp::resources::Sycl;
-  };
+template <>
+struct get_resource_from_platform<Platform::sycl> {
+  using type = camp::resources::Sycl;
+};
+
+template <size_t BlockSize, bool Async>
+struct get_resource<::RAJA::policy::sycl::sycl_exec<BlockSize, Async>> {
+  using type = camp::resources::Sycl;
+};
+
+template <bool Async, int num_threads>
+struct get_resource<::RAJA::policy::sycl::sycl_launch_t<Async, num_threads>> {
+  using type = camp::resources::Sycl;
+};
+
+template <typename ISetIter, size_t BlockSize, bool Async>
+struct get_resource<
+    ExecPolicy<ISetIter, ::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>> {
+  using type = camp::resources::Sycl;
+};
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-  template<>
-  struct get_resource_from_platform<Platform::omp_target>{
-    using type = camp::resources::Omp;
-  };
-
-  template<>
-  struct get_resource<::RAJA::policy::omp::omp_target_parallel_for_exec_nt>{
-    using type = camp::resources::Omp;
-  };
-
-  template<size_t ThreadsPerTeam>
-  struct get_resource<::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>{
-    using type = camp::resources::Omp;
-  };
-
-  template<typename ISetIter>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::omp::omp_target_parallel_for_exec_nt>>{
-    using type = camp::resources::Omp;
-  };
-
-  template<typename ISetIter, size_t ThreadsPerTeam>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>>{
-    using type = camp::resources::Omp;
-  };
+template <>
+struct get_resource_from_platform<Platform::omp_target> {
+  using type = camp::resources::Omp;
+};
+
+template <>
+struct get_resource<::RAJA::policy::omp::omp_target_parallel_for_exec_nt> {
+  using type = camp::resources::Omp;
+};
+
+template <size_t ThreadsPerTeam>
+struct get_resource<
+    ::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>> {
+  using type = camp::resources::Omp;
+};
+
+template <typename ISetIter>
+struct get_resource<
+    ExecPolicy<ISetIter,
+               ::RAJA::policy::omp::omp_target_parallel_for_exec_nt>> {
+  using type = camp::resources::Omp;
+};
+
+template <typename ISetIter, size_t ThreadsPerTeam>
+struct get_resource<ExecPolicy<
+    ISetIter,
+    ::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>> {
+  using type = camp::resources::Omp;
+};
 #endif
 
-  } // end namespace resources
+}  // end namespace resources
 
-  namespace type_traits
-  {
-    template <typename T> struct is_resource : std::false_type {};
-    template <> struct is_resource<resources::Host> : std::true_type {};
+namespace type_traits
+{
+template <typename T>
+struct is_resource : std::false_type {
+};
+template <>
+struct is_resource<resources::Host> : std::true_type {
+};
 #if defined(RAJA_CUDA_ACTIVE)
-    template <> struct is_resource<resources::Cuda> : std::true_type {};
+template <>
+struct is_resource<resources::Cuda> : std::true_type {
+};
 #endif
 #if defined(RAJA_HIP_ACTIVE)
-    template <> struct is_resource<resources::Hip> : std::true_type {};
+template <>
+struct is_resource<resources::Hip> : std::true_type {
+};
 #endif
 #if defined(RAJA_SYCL_ACTIVE)
-    template <> struct is_resource<resources::Sycl> : std::true_type {};
+template <>
+struct is_resource<resources::Sycl> : std::true_type {
+};
 #endif
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-    template <> struct is_resource<resources::Omp> : std::true_type {};
+template <>
+struct is_resource<resources::Omp> : std::true_type {
+};
 #endif
-  } // end namespace type_traits
+}  // end namespace type_traits
 
 }  // end namespace RAJA
 
-#endif //RAJA_resources_HPP#
+#endif  // RAJA_resources_HPP#
diff --git a/include/RAJA/util/sort.hpp b/include/RAJA/util/sort.hpp
index bbec03dfe1..24fef342a4 100644
--- a/include/RAJA/util/sort.hpp
+++ b/include/RAJA/util/sort.hpp
@@ -18,15 +18,13 @@
 #ifndef RAJA_util_sort_HPP
 #define RAJA_util_sort_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iterator>
 #include <memory>
 
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/detail/algorithm.hpp"
-
-#include "RAJA/util/macros.hpp"
 #include "RAJA/util/concepts.hpp"
+#include "RAJA/util/macros.hpp"
 #include "RAJA/util/math.hpp"
 
 namespace RAJA
@@ -40,11 +38,9 @@ namespace detail
     and using O(N) predicate evaluations and O(1) memory
 */
 template <typename Iter, typename Predicate>
-RAJA_HOST_DEVICE RAJA_INLINE
-Iter
-partition(Iter begin,
-          Iter end,
-          Predicate pred)
+RAJA_HOST_DEVICE RAJA_INLINE Iter partition(Iter begin,
+                                            Iter end,
+                                            Predicate pred)
 {
   using ::RAJA::safe_iter_swap;
 
@@ -67,7 +63,8 @@ partition(Iter begin,
   }
 
   // advance through rest of list to find the next true
-  for (Iter next_true = RAJA::next(first_false); next_true != end; ++next_true) {
+  for (Iter next_true = RAJA::next(first_false); next_true != end;
+       ++next_true) {
 
     // find the end of a range of falses [first_false, next_true)
     if (pred(next_true)) {
@@ -87,11 +84,9 @@ partition(Iter begin,
     and using O(N^2) comparisons and O(1) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-insertion_sort(Iter begin,
-               Iter end,
-               Compare comp)
+RAJA_HOST_DEVICE RAJA_INLINE void insertion_sort(Iter begin,
+                                                 Iter end,
+                                                 Compare comp)
 {
   using ::RAJA::safe_iter_swap;
 
@@ -100,7 +95,8 @@ insertion_sort(Iter begin,
   }
 
   // for each unsorted item in the right side of the range
-  for (Iter next_unsorted = RAJA::next(begin); next_unsorted != end; ++next_unsorted) {
+  for (Iter next_unsorted = RAJA::next(begin); next_unsorted != end;
+       ++next_unsorted) {
 
     // insert unsorted item into the sorted left side of the range
     for (Iter to_insert = next_unsorted; to_insert != begin; --to_insert) {
@@ -125,30 +121,57 @@ insertion_sort(Iter begin,
 /*!
     \brief get number of strides for shell sort
 */
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr size_t num_shell_strides()
-{
-  return 39;
-}
+RAJA_HOST_DEVICE RAJA_INLINE constexpr size_t num_shell_strides() { return 39; }
 
 /*!
     \brief get strides for shell sort
 */
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr long long unsigned get_shell_stride(int i)
+RAJA_HOST_DEVICE RAJA_INLINE constexpr long long unsigned get_shell_stride(
+    int i)
 {
   using array_type = long long unsigned[num_shell_strides()];
   return (array_type{
       // strides from M. Ciura 2001
-      1llu, 4llu, 10llu, 23llu, 57llu, 132llu, 301llu, 701llu, 1750llu,
+      1llu,
+      4llu,
+      10llu,
+      23llu,
+      57llu,
+      132llu,
+      301llu,
+      701llu,
+      1750llu,
       // extended up to 2^47 with strides[n] = floor(2.25*strides[n-1])
-      3937llu, 8858llu, 19930llu, 44842llu, 100894llu, 227011llu, 510774llu,
-      1149241llu, 2585792llu, 5818032llu, 13090572llu, 29453787llu, 66271020llu,
-      149109795llu, 335497038llu, 754868335llu, 1698453753llu, 3821520944llu,
-      8598422124llu, 19346449779llu, 43529512002llu, 97941402004llu,
-      220368154509llu, 495828347645llu, 1115613782201llu, 2510131009952llu,
-      5647794772392llu, 12707538237882llu, 28591961035234llu, 64331912329276llu
-    })[i];
+      3937llu,
+      8858llu,
+      19930llu,
+      44842llu,
+      100894llu,
+      227011llu,
+      510774llu,
+      1149241llu,
+      2585792llu,
+      5818032llu,
+      13090572llu,
+      29453787llu,
+      66271020llu,
+      149109795llu,
+      335497038llu,
+      754868335llu,
+      1698453753llu,
+      3821520944llu,
+      8598422124llu,
+      19346449779llu,
+      43529512002llu,
+      97941402004llu,
+      220368154509llu,
+      495828347645llu,
+      1115613782201llu,
+      2510131009952llu,
+      5647794772392llu,
+      12707538237882llu,
+      28591961035234llu,
+      64331912329276llu})[i];
 }
 
 /*!
@@ -156,11 +179,7 @@ constexpr long long unsigned get_shell_stride(int i)
     and using O(N^?) comparisons and O(1) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-shell_sort(Iter begin,
-           Iter end,
-           Compare comp)
+RAJA_HOST_DEVICE RAJA_INLINE void shell_sort(Iter begin, Iter end, Compare comp)
 {
   using ::RAJA::safe_iter_swap;
   using diff_type = ::RAJA::detail::IterDiff<Iter>;
@@ -188,10 +207,12 @@ shell_sort(Iter begin,
       diff_type stride = static_cast<diff_type>(get_shell_stride(i_stride));
 
       // for each unsorted item in the right side of each strided range
-      for (diff_type i_next_unsorted = stride; i_next_unsorted != n; ++i_next_unsorted) {
+      for (diff_type i_next_unsorted = stride; i_next_unsorted != n;
+           ++i_next_unsorted) {
 
         // insert unsorted item into the sorted left side of the strided range
-        for (diff_type i_to_insert = i_next_unsorted; i_to_insert >= stride; i_to_insert -= stride) {
+        for (diff_type i_to_insert = i_next_unsorted; i_to_insert >= stride;
+             i_to_insert -= stride) {
 
           Iter to_insert = begin + i_to_insert;
           Iter next_sorted = to_insert - stride;
@@ -222,12 +243,10 @@ shell_sort(Iter begin,
     and using O(lg(N)) comparisons and O(1) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-heapify(Iter begin,
-        Iter root,
-        Iter end,
-        Compare comp)
+RAJA_HOST_DEVICE RAJA_INLINE void heapify(Iter begin,
+                                          Iter root,
+                                          Iter end,
+                                          Compare comp)
 {
   using RAJA::safe_iter_swap;
 
@@ -235,13 +254,13 @@ heapify(Iter begin,
 
   // heapify the root node into place
   // until this is a max heap again
-  for (auto i = root - begin; 2*i+1 < N; i = root - begin) {
+  for (auto i = root - begin; 2 * i + 1 < N; i = root - begin) {
 
     // find the max item amongst the root, left child, and right child
     Iter maxit = root;
 
     // left child
-    Iter child = begin + 2*i+1;
+    Iter child = begin + 2 * i + 1;
     if (comp(*maxit, *child)) {
       maxit = child;
     }
@@ -269,11 +288,7 @@ heapify(Iter begin,
     and using O(N*lg(N)) comparisons and O(1) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE inline
-void
-heap_sort(Iter begin,
-          Iter end,
-          Compare comp)
+RAJA_HOST_DEVICE inline void heap_sort(Iter begin, Iter end, Compare comp)
 {
   using RAJA::safe_iter_swap;
 
@@ -286,7 +301,7 @@ heap_sort(Iter begin,
 
   // make range into a max heap by
   // going through nodes with children one-by-one in reverse order
-  for (Iter root = begin + (N-1)/2; root != begin; --root) {
+  for (Iter root = begin + (N - 1) / 2; root != begin; --root) {
     // heapify a sub-heap
     heapify(begin, root, end, comp);
   }
@@ -307,16 +322,14 @@ heap_sort(Iter begin,
 /*!
     \brief max recursion depth for intro sort when compiling device code.
 */
-struct intro_sort_device_max_depth
-{
+struct intro_sort_device_max_depth {
   static constexpr unsigned get() { return 4; }
 };
 
 /*!
     \brief cutoff for intro sort to use insertion sort on small ranges.
 */
-struct intro_sort_insertion_sort_cutoff
-{
+struct intro_sort_insertion_sort_cutoff {
   static constexpr size_t get() { return 16; }
 };
 
@@ -325,12 +338,10 @@ struct intro_sort_insertion_sort_cutoff
     and using O(N*lg(N)) comparisons and O(lg(N)) memory, with limited depth.
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE inline
-void
-intro_sort_depth(Iter begin,
-                 Iter end,
-                 Compare comp,
-                 unsigned depth)
+RAJA_HOST_DEVICE inline void intro_sort_depth(Iter begin,
+                                              Iter end,
+                                              Compare comp,
+                                              unsigned depth)
 {
   using RAJA::safe_iter_swap;
   using diff_type = ::RAJA::detail::IterDiff<Iter>;
@@ -359,19 +370,12 @@ intro_sort_depth(Iter begin,
 
     // use quick sort
     // choose pivot with median of 3 (N >= insertion_sort_cutoff)
-    Iter mid = begin + N/2;
-    Iter last = end-1;
-    Iter pivot = comp(*begin, *mid)
-                    ? ( comp(*mid, *last)
-                           ? mid
-                           : ( comp(*begin, *last)
-                                  ? last
-                                  : begin ) )
-                    : ( comp(*mid, *last)
-                           ? ( comp(*begin, *last)
-                                  ? begin
-                                  : last )
-                           : mid );
+    Iter mid = begin + N / 2;
+    Iter last = end - 1;
+    Iter pivot =
+        comp(*begin, *mid)
+            ? (comp(*mid, *last) ? mid : (comp(*begin, *last) ? last : begin))
+            : (comp(*mid, *last) ? (comp(*begin, *last) ? begin : last) : mid);
 
     // swap pivot to last
     if (pivot != last) {
@@ -380,7 +384,7 @@ intro_sort_depth(Iter begin,
     }
 
     // partition
-    mid = partition(begin, last, [&](Iter it){ return comp(*it, *pivot); });
+    mid = partition(begin, last, [&](Iter it) { return comp(*it, *pivot); });
 
     // swap pivot to sorted position
     if (mid != pivot) {
@@ -390,8 +394,8 @@ intro_sort_depth(Iter begin,
 
     // recurse to sort first and second parts, ignoring already sorted pivot
     // by construction pivot is always in the range [begin, last]
-    detail::intro_sort_depth(begin, pivot, comp, depth-1);
-    detail::intro_sort_depth(RAJA::next(pivot), end, comp, depth-1);
+    detail::intro_sort_depth(begin, pivot, comp, depth - 1);
+    detail::intro_sort_depth(RAJA::next(pivot), end, comp, depth - 1);
   }
 }
 
@@ -400,19 +404,16 @@ intro_sort_depth(Iter begin,
     and using O(N*lg(N)) comparisons and O(lg(N)) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE inline
-void
-intro_sort(Iter begin,
-           Iter end,
-           Compare comp)
+RAJA_HOST_DEVICE inline void intro_sort(Iter begin, Iter end, Compare comp)
 {
   auto N = end - begin;
 
   // set max depth to 2*lg(N)
-  unsigned max_depth = 2*RAJA::log2(N);
+  unsigned max_depth = 2 * RAJA::log2(N);
 
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-  // limit max_depth statically in device code to allow compiler to remove recursion
+  // limit max_depth statically in device code to allow compiler to remove
+  // recursion
   if (max_depth > detail::intro_sort_device_max_depth::get()) {
     max_depth = detail::intro_sort_device_max_depth::get();
   }
@@ -426,26 +427,19 @@ intro_sort(Iter begin,
     with local range/2 copy
 */
 template <typename Iter, typename Compare>
-void
-RAJA_INLINE
-inplace_merge(  Iter first,
-                Iter middle,
-                Iter last,
-                Compare comp  )
+void RAJA_INLINE inplace_merge(Iter first, Iter middle, Iter last, Compare comp)
 {
   using diff_type = RAJA::detail::IterDiff<Iter>;
   using value_type = RAJA::detail::IterVal<Iter>;
 
   diff_type copylen = middle - first;
 
-  if ( first == middle || middle == last )
-  {
+  if (first == middle || middle == last) {
     // at least one side empty, already sorted
     return;
   }
 
-  if ( !comp(*middle, *(middle-1)) )
-  {
+  if (!comp(*middle, *(middle - 1))) {
     // everything already in order, done
     return;
   }
@@ -455,43 +449,38 @@ inplace_merge(  Iter first,
   buf_deleter_type buf_deleter;
 
   std::unique_ptr<value_type, buf_deleter_type&> copy_buf(
-      RAJA::allocate_aligned_type<value_type>( RAJA::DATA_ALIGN, copylen * sizeof(value_type) ),
+      RAJA::allocate_aligned_type<value_type>(RAJA::DATA_ALIGN,
+                                              copylen * sizeof(value_type)),
       buf_deleter);
 
   value_type* copyarr = copy_buf.get();
 
   // check memory allocation worked
   if (copyarr == nullptr) {
-    RAJA_ABORT_OR_THROW( "inplace_merge temporary memory allocation failed" );
+    RAJA_ABORT_OR_THROW("inplace_merge temporary memory allocation failed");
   }
 
   // move construct input into buffer storage
   // use buf_deleter.size as index to keep track of objects constructed
-  for ( diff_type& cc = buf_deleter.size; cc < copylen; ++cc )
-  {
-    new(&copyarr[cc]) value_type(std::move(first[cc]));
+  for (diff_type& cc = buf_deleter.size; cc < copylen; ++cc) {
+    new (&copyarr[cc]) value_type(std::move(first[cc]));
   }
 
   // merge
-  for ( diff_type cur = 0; cur < copylen; )
-  {
-    if ( middle >= last ) // moved all second half, put copy into remainder
+  for (diff_type cur = 0; cur < copylen;) {
+    if (middle >= last)  // moved all second half, put copy into remainder
     {
-      std::move( copyarr+cur, copyarr+copylen, first );
+      std::move(copyarr + cur, copyarr + copylen, first);
       break;
-    }
-    else if ( first == middle ) // everything prior to middle is sorted, done
+    } else if (first == middle)  // everything prior to middle is sorted, done
     {
       break;
     }
 
-    if ( comp(*middle, copyarr[cur]) )
-    {
+    if (comp(*middle, copyarr[cur])) {
       *first = std::move(*middle);
       ++middle;
-    }
-    else
-    {
+    } else {
       *first = std::move(copyarr[cur]);
       ++cur;
     }
@@ -505,53 +494,45 @@ inplace_merge(  Iter first,
     while copies are outside, somewhat follows STL API
 */
 template <typename Iter1, typename Iter2, typename OutIter, typename Compare>
-//constexpr OutIter // <-- std:: return value
-void
-RAJA_INLINE
-merge_like_std( Iter1 first1,
-                Iter1 last1,
-                Iter2 first2,
-                Iter2 last2,
-                OutIter d_first,  // using this as direct access to result
-                Compare comp)
+// std::merge returns constexpr OutIter; this variant returns void
+void RAJA_INLINE
+merge_like_std(Iter1 first1,
+               Iter1 last1,
+               Iter2 first2,
+               Iter2 last2,
+               OutIter d_first,  // using this as direct access to result
+               Compare comp)
 {
   using ::RAJA::safe_iter_swap;
 
-  if ( first1 == last2 - 1 )  // should never need to do this
+  if (first1 == last2 - 1)  // should never need to do this
   {
     return;
   }
 
-  if ( (last2 - first1) == 2 ) // only 2 elements, simple swap
+  if ((last2 - first1) == 2)  // only 2 elements, simple swap
   {
-    if ( !comp(*d_first, *(d_first+1)) )
-    {
-      safe_iter_swap( d_first, d_first+1 );
+    if (!comp(*d_first, *(d_first + 1))) {
+      safe_iter_swap(d_first, d_first + 1);
     }
     return;
   }
 
-  while ( first1 < last1 || first2 < last2 )
-  {
-    if ( first1 >= last1 ) // first half done
+  while (first1 < last1 || first2 < last2) {
+    if (first1 >= last1)  // first half done
     {
       *d_first = std::move(*first2);
       ++first2;
-    }
-    else if ( first2 >= last2 )  // second half done
+    } else if (first2 >= last2)  // second half done
     {
       *d_first = std::move(*first1);
       ++first1;
-    }
-    else  // neither half done
+    } else  // neither half done
     {
-      if ( comp( *first2, *first1 ) )
-      {
+      if (comp(*first2, *first1)) {
         *d_first = std::move(*first2);
         ++first2;
-      }
-      else
-      {
+      } else {
         *d_first = std::move(*first1);
         ++first1;
       }
@@ -568,11 +549,7 @@ merge_like_std( Iter1 first1,
     and using O(N*lg(N)) comparisons and O(N) memory
 */
 template <typename Iter, typename Compare>
-RAJA_INLINE
-void
-merge_sort(Iter begin,
-           Iter end,
-           Compare comp)
+RAJA_INLINE void merge_sort(Iter begin, Iter end, Compare comp)
 {
   using diff_type = RAJA::detail::IterDiff<Iter>;
   using value_type = RAJA::detail::IterVal<Iter>;
@@ -580,22 +557,18 @@ merge_sort(Iter begin,
   // iterative mergesort (bottom up) for future parallelism
 
   // min helper
-  auto minlam = [] (diff_type a, diff_type b) {return (a < b) ? a : b;};
+  auto minlam = [](diff_type a, diff_type b) { return (a < b) ? a : b; };
 
   // insertion sort for sizes <= 16
   diff_type len = end - begin;
   static constexpr diff_type insertion_sort_cutoff = 16;
-  if ( len <= insertion_sort_cutoff && len > 0 )
-  {
-    detail::insertion_sort( begin, end, comp );
-  }
-  else
-  {
+  if (len <= insertion_sort_cutoff && len > 0) {
+    detail::insertion_sort(begin, end, comp);
+  } else {
     // insertion sort on 16-element chunks, then merge
-    for ( diff_type start = 0; start < len; start += insertion_sort_cutoff )
-    {
-      diff_type lastchunk = minlam( insertion_sort_cutoff, len - start );
-      detail::insertion_sort( begin + start, begin + start + lastchunk, comp );
+    for (diff_type start = 0; start < len; start += insertion_sort_cutoff) {
+      diff_type lastchunk = minlam(insertion_sort_cutoff, len - start);
+      detail::insertion_sort(begin + start, begin + start + lastchunk, comp);
     }
 
     // merge using extra storage
@@ -605,74 +578,82 @@ merge_sort(Iter begin,
     buf_deleter_type buf_deleter;
 
     std::unique_ptr<value_type, buf_deleter_type&> copy_buf(
-        RAJA::allocate_aligned_type<value_type>( RAJA::DATA_ALIGN, len * sizeof(value_type) ),
+        RAJA::allocate_aligned_type<value_type>(RAJA::DATA_ALIGN,
+                                                len * sizeof(value_type)),
         buf_deleter);
 
     value_type* copyarr = copy_buf.get();
 
     // check memory allocation worked
     if (copyarr == nullptr) {
-      RAJA_ABORT_OR_THROW( "merge_sort temporary memory allocation failed" );
+      RAJA_ABORT_OR_THROW("merge_sort temporary memory allocation failed");
     }
 
     // move construct input into buffer storage
     // use buf_deleter.size as index to keep track of objects constructed
-    for ( diff_type& cc = buf_deleter.size; cc < len; ++cc )
-    {
-      new(&copyarr[cc]) value_type(std::move(begin[cc]));
+    for (diff_type& cc = buf_deleter.size; cc < len; ++cc) {
+      new (&copyarr[cc]) value_type(std::move(begin[cc]));
     }
 
     bool copyvalid = true;
-    //for ( diff_type midpoint = 1; midpoint < len; midpoint *= 2 )  // O(log n) loop
-    for ( diff_type midpoint = 16; midpoint < len; midpoint *= 2 )  // O(log n) loop
+    // for ( diff_type midpoint = 1; midpoint < len; midpoint *= 2 )
+    // (disabled variant of the O(log n) loop below, starting at midpoint 1)
+    for (diff_type midpoint = 16; midpoint < len;
+         midpoint *= 2)  // O(log n) loop
     {
-      for ( diff_type start = 0; start < len; start += midpoint * 2 )  // O(n) merging loop (can be parallelized)
+      for (diff_type start = 0; start < len;
+           start += midpoint * 2)  // O(n) merging loop (can be parallelized)
       {
-        diff_type finish = minlam( start + midpoint * 2, len );
-        if ( finish > len )
-        {
-          RAJA_ABORT_OR_THROW( "merge_sort invalid finish point" );  // sanity check
+        diff_type finish = minlam(start + midpoint * 2, len);
+        if (finish > len) {
+          RAJA_ABORT_OR_THROW(
+              "merge_sort invalid finish point");  // sanity check
         }
 
-        if ( start + midpoint >= len )
-        {
+        if (start + midpoint >= len) {
           // copy sorted remainder over
-          if ( copyvalid )
-          {
-            std::move( copyarr + start, copyarr + finish, begin + start );
-          }
-          else
-          {
-            std::move( begin + start, begin + finish, copyarr + start );
+          if (copyvalid) {
+            std::move(copyarr + start, copyarr + finish, begin + start);
+          } else {
+            std::move(begin + start, begin + finish, copyarr + start);
           }
           break;  // skip merge if no second half exists
         }
 
-        if ( copyvalid )  // switch arrays per level of merging to avoid copying back to copyarr
+        if (copyvalid)  // switch arrays per level of merging to avoid copying
+                        // back to copyarr
         {
-          detail::merge_like_std( copyarr + start, copyarr + start + midpoint, copyarr + start + midpoint, copyarr + finish, begin + start, comp );
-        }
-        else
-        {
-          detail::merge_like_std( begin + start, begin + start + midpoint, begin + start + midpoint, begin + finish, copyarr + start, comp );
+          detail::merge_like_std(copyarr + start,
+                                 copyarr + start + midpoint,
+                                 copyarr + start + midpoint,
+                                 copyarr + finish,
+                                 begin + start,
+                                 comp);
+        } else {
+          detail::merge_like_std(begin + start,
+                                 begin + start + midpoint,
+                                 begin + start + midpoint,
+                                 begin + finish,
+                                 copyarr + start,
+                                 comp);
         }
       }
 
-      copyvalid = !copyvalid; // switch arrays per level of merging to avoid copying back to copyarr
+      copyvalid = !copyvalid;  // switch arrays per level of merging to avoid
+                               // copying back to copyarr
     }
 
     // update copy if necessary
-    if ( copyvalid )
-    {
-      std::move( copyarr, copyarr + len, begin );
+    if (copyvalid) {
+      std::move(copyarr, copyarr + len, begin);
     }
   }
-  //else
+  // else
   //{
-      // Possible TBD: in-place mergesort
-      // Would shift (like insertion sort) when performing merge.
-      // PRO - Can use on GPU, O(1) storage required.
-      // CON - Shifting would cause slowdown O(n^2 log n).
+  //  Possible TBD: in-place mergesort
+  //  Would shift (like insertion sort) when performing merge.
+  //  PRO - Can use on GPU, O(1) storage required.
+  //  CON - Shifting would cause slowdown O(n^2 log n).
   //}
 }
 
@@ -685,9 +666,8 @@ merge_sort(Iter begin,
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-insertion_sort(Container&& c,
-               Compare comp = Compare{})
+    concepts::enable_if<type_traits::is_range<Container>>
+    insertion_sort(Container&& c, Compare comp = Compare{})
 {
   using std::begin;
   using std::end;
@@ -698,7 +678,7 @@ insertion_sort(Container&& c,
                 "Container must model RandomAccessRange");
 
   auto begin_it = begin(c);
-  auto end_it   = end(c);
+  auto end_it = end(c);
 
   if (begin_it != end_it) {
     auto next = begin_it;
@@ -715,9 +695,8 @@ insertion_sort(Container&& c,
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-shell_sort(Container&& c,
-           Compare comp = Compare{})
+    concepts::enable_if<type_traits::is_range<Container>>
+    shell_sort(Container&& c, Compare comp = Compare{})
 {
   using std::begin;
   using std::end;
@@ -728,7 +707,7 @@ shell_sort(Container&& c,
                 "Container must model RandomAccessRange");
 
   auto begin_it = begin(c);
-  auto end_it   = end(c);
+  auto end_it = end(c);
 
   if (begin_it != end_it) {
     auto next = begin_it;
@@ -745,9 +724,8 @@ shell_sort(Container&& c,
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-heap_sort(Container&& c,
-          Compare comp = Compare{})
+    concepts::enable_if<type_traits::is_range<Container>>
+    heap_sort(Container&& c, Compare comp = Compare{})
 {
   using std::begin;
   using std::end;
@@ -758,7 +736,7 @@ heap_sort(Container&& c,
                 "Container must model RandomAccessRange");
 
   auto begin_it = begin(c);
-  auto end_it   = end(c);
+  auto end_it = end(c);
 
   if (begin_it != end_it) {
     auto next = begin_it;
@@ -775,9 +753,8 @@ heap_sort(Container&& c,
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-intro_sort(Container&& c,
-           Compare comp = Compare{})
+    concepts::enable_if<type_traits::is_range<Container>>
+    intro_sort(Container&& c, Compare comp = Compare{})
 {
   using std::begin;
   using std::end;
@@ -788,7 +765,7 @@ intro_sort(Container&& c,
                 "Container must model RandomAccessRange");
 
   auto begin_it = begin(c);
-  auto end_it   = end(c);
+  auto end_it = end(c);
 
   if (begin_it != end_it) {
     auto next = begin_it;
@@ -804,10 +781,9 @@ intro_sort(Container&& c,
 */
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
-RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-merge_sort(Container&& c,
-           Compare comp = Compare{})
+RAJA_INLINE concepts::enable_if<type_traits::is_range<Container>> merge_sort(
+    Container&& c,
+    Compare comp = Compare{})
 {
   using std::begin;
   using std::end;
@@ -818,7 +794,7 @@ merge_sort(Container&& c,
                 "Container must model RandomAccessRange");
 
   auto begin_it = begin(c);
-  auto end_it   = end(c);
+  auto end_it = end(c);
 
   if (begin_it != end_it) {
     auto next = begin_it;
diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp
index 0674db71c4..caf5894efa 100644
--- a/include/RAJA/util/types.hpp
+++ b/include/RAJA/util/types.hpp
@@ -20,17 +20,16 @@
 #ifndef RAJA_Types_HPP
 #define RAJA_Types_HPP
 
-#include "RAJA/config.hpp"
-
 #include <cstddef>
 
+#include "RAJA/config.hpp"
+
 #if defined(RAJA_USE_COMPLEX)
 #include <complex>
 #endif
 
-#include "camp/helpers.hpp"
-
 #include "RAJA/util/macros.hpp"
+#include "camp/helpers.hpp"
 
 
 namespace RAJA
@@ -39,30 +38,17 @@ namespace RAJA
 ///
 /// Enum for named values with special usage.
 ///
-enum named_usage : int
-{
-  ignored = -1,
-  unspecified = 0
-};
+enum named_usage : int { ignored = -1, unspecified = 0 };
 
 ///
 /// Enum for named dimensions.
 ///
-enum struct named_dim : int
-{
-  x = 0,
-  y = 1,
-  z = 2
-};
+enum struct named_dim : int { x = 0, y = 1, z = 2 };
 
 ///
 /// Enum for synchronization requirements in some kernel constructs.
 ///
-enum struct kernel_sync_requirement : int
-{
-  none = 0,
-  sync = 1
-};
+enum struct kernel_sync_requirement : int { none = 0, sync = 1 };
 
 ///
 /// Classes used to indicate how to map iterations in a loop to indices.
@@ -70,16 +56,22 @@ enum struct kernel_sync_requirement : int
 namespace iteration_mapping
 {
 
-struct DirectUncheckedBase {};
-struct DirectBase {};
-struct LoopBase {};
-struct ContiguousLoopBase : LoopBase {};
-struct StridedLoopBase : LoopBase {};
-struct UnsizedLoopBase {};
-struct SizedLoopBase {};
-template < size_t t_max_iterations >
-struct SizedLoopSpecifyingBase : SizedLoopBase
-{
+struct DirectUncheckedBase {
+};
+struct DirectBase {
+};
+struct LoopBase {
+};
+struct ContiguousLoopBase : LoopBase {
+};
+struct StridedLoopBase : LoopBase {
+};
+struct UnsizedLoopBase {
+};
+struct SizedLoopBase {
+};
+template <size_t t_max_iterations>
+struct SizedLoopSpecifyingBase : SizedLoopBase {
   static constexpr size_t max_iterations = t_max_iterations;
 };
 
@@ -100,7 +92,8 @@ struct SizedLoopSpecifyingBase : SizedLoopBase
 ///   // 2 -> {2}
 ///   // 3 -> {3}
 ///
-struct DirectUnchecked : DirectUncheckedBase {};
+struct DirectUnchecked : DirectUncheckedBase {
+};
 
 ///
 /// Direct assumes the loop has enough iterations for all of the indices and
@@ -123,7 +116,8 @@ struct DirectUnchecked : DirectUncheckedBase {};
 ///   // 3 -> {3}
 ///   // 4 -> {safely-ignored}
 ///
-struct Direct : DirectBase {};
+struct Direct : DirectBase {
+};
 
 ///
 /// Contiguousloop assumes the loop has fewer iterations than indices and
@@ -150,10 +144,13 @@ struct Direct : DirectBase {};
 ///   // 1 -> {3, 4, 5}
 ///   // 2 -> {6, 7}
 ///
-template < size_t max_iterations >
-struct Contiguousloop : ContiguousLoopBase,
-    std::conditional_t<(max_iterations != named_usage::unspecified),
-                       SizedLoopSpecifyingBase<max_iterations>, UnsizedLoopBase> {};
+template <size_t max_iterations>
+struct Contiguousloop
+    : ContiguousLoopBase,
+      std::conditional_t<(max_iterations != named_usage::unspecified),
+                         SizedLoopSpecifyingBase<max_iterations>,
+                         UnsizedLoopBase> {
+};
 
 ///
 /// StridedLoop assumes the loop has fewer iterations than indices and
@@ -180,12 +177,15 @@ struct Contiguousloop : ContiguousLoopBase,
 ///   // 1 -> {1, 4, 7}
 ///   // 2 -> {2, 5}
 ///
-template < size_t max_iterations >
-struct StridedLoop : StridedLoopBase,
-    std::conditional_t<(max_iterations != named_usage::unspecified),
-                       SizedLoopSpecifyingBase<max_iterations>, UnsizedLoopBase> {};
+template <size_t max_iterations>
+struct StridedLoop
+    : StridedLoopBase,
+      std::conditional_t<(max_iterations != named_usage::unspecified),
+                         SizedLoopSpecifyingBase<max_iterations>,
+                         UnsizedLoopBase> {
+};
 
-} // namespace iteration_mapping
+}  // namespace iteration_mapping
 
 ///
 /// Enumeration used to indicate whether ListSegment object owns data
@@ -217,21 +217,20 @@ struct SizeList {
 /// Compile time fraction for use with integral types
 ///
 template <typename int_t, int_t numerator, int_t denominator>
-struct Fraction
-{
+struct Fraction {
   static_assert(denominator != int_t(0), "denominator must not be zero");
 
   using inverse = Fraction<int_t, denominator, numerator>;
 
-  template < typename new_int_t >
-  using rebind = Fraction<new_int_t, new_int_t(numerator), new_int_t(denominator)>;
+  template <typename new_int_t>
+  using rebind =
+      Fraction<new_int_t, new_int_t(numerator), new_int_t(denominator)>;
 
   static constexpr int_t multiply(int_t val) noexcept
   {
     return (val / denominator) * numerator +
            (val % denominator) * numerator / denominator;
   }
-
 };
 
 
@@ -274,7 +273,8 @@ using Complex_type = std::complex<Real_type>;
 // alignment attribute supported for versions > 12
 //
 #if __ICC >= 1300
-using TDRAReal_ptr = Real_type* RAJA_RESTRICT __attribute__((align_value(RAJA::DATA_ALIGN)));
+using TDRAReal_ptr =
+    Real_type* RAJA_RESTRICT __attribute__((align_value(RAJA::DATA_ALIGN)));
 
 using const_TDRAReal_ptr = const TDRAReal_ptr;
 #endif
@@ -282,7 +282,8 @@ using const_TDRAReal_ptr = const TDRAReal_ptr;
 #elif defined(RAJA_COMPILER_GNU)
 
 #elif defined(RAJA_COMPILER_CLANG)
-using TDRAReal_ptr = Real_type* RAJA_RESTRICT __attribute__((aligned(RAJA::DATA_ALIGN)));
+using TDRAReal_ptr =
+    Real_type* RAJA_RESTRICT __attribute__((aligned(RAJA::DATA_ALIGN)));
 
 using const_TDRAReal_ptr = const TDRAReal_ptr;
 
@@ -887,20 +888,20 @@ using const_UnalignedReal_ptr = ConstRestrictRealPtr;
 #endif
 
 
-namespace detail {
+namespace detail
+{
 
 /*!
  * \brief Abstracts access to memory using normal memory accesses.
  */
-struct DefaultAccessor
-{
-  template < typename T >
+struct DefaultAccessor {
+  template <typename T>
   static RAJA_HOST_DEVICE RAJA_INLINE T get(T* ptr, size_t i)
   {
     return ptr[i];
   }
 
-  template < typename T >
+  template <typename T>
   static RAJA_HOST_DEVICE RAJA_INLINE void set(T* ptr, size_t i, T val)
   {
     ptr[i] = val;
@@ -915,8 +916,7 @@ struct DefaultAccessor
 template <typename T,
           size_t min_integer_type_size = 1,
           size_t max_integer_type_size = sizeof(unsigned long long)>
-struct AsIntegerArray
-{
+struct AsIntegerArray {
   static_assert(min_integer_type_size <= max_integer_type_size,
                 "incompatible min and max integer type size");
   using integer_type = std::conditional_t<
@@ -939,11 +939,11 @@ struct AsIntegerArray
                     sizeof(unsigned short) <= max_integer_type_size) ||
                    sizeof(unsigned char) < min_integer_type_size),
                   unsigned short,
-                  std::conditional_t<
-                      ((alignof(T) >= alignof(unsigned char) &&
-                        sizeof(unsigned char) <= max_integer_type_size)),
-                      unsigned char,
-                      void>>>>>;
+                  std::conditional_t<((alignof(T) >= alignof(unsigned char) &&
+                                       sizeof(unsigned char) <=
+                                           max_integer_type_size)),
+                                     unsigned char,
+                                     void>>>>>;
   static_assert(!std::is_same<integer_type, void>::value,
                 "could not find a compatible integer type");
   static_assert(sizeof(integer_type) >= min_integer_type_size,
@@ -982,31 +982,25 @@ struct AsIntegerArray
  * value at the end of the current scope.
  */
 template <typename T>
-struct ScopedAssignment
-{
+struct ScopedAssignment {
   ScopedAssignment(T& val, T const& new_val)
-    : m_ref_to_val(val)
-    , m_prev_val(std::move(val))
+      : m_ref_to_val(val), m_prev_val(std::move(val))
   {
     m_ref_to_val = new_val;
   }
 
   ScopedAssignment(T& val, T&& new_val)
-    : m_ref_to_val(val)
-    , m_prev_val(std::move(val))
+      : m_ref_to_val(val), m_prev_val(std::move(val))
   {
     m_ref_to_val = std::move(new_val);
   }
 
   ScopedAssignment(ScopedAssignment const&) = delete;
-  ScopedAssignment(ScopedAssignment &&) = delete;
+  ScopedAssignment(ScopedAssignment&&) = delete;
   ScopedAssignment& operator=(ScopedAssignment const&) = delete;
-  ScopedAssignment& operator=(ScopedAssignment &&) = delete;
+  ScopedAssignment& operator=(ScopedAssignment&&) = delete;
 
-  ~ScopedAssignment()
-  {
-    m_ref_to_val = std::move(m_prev_val);
-  }
+  ~ScopedAssignment() { m_ref_to_val = std::move(m_prev_val); }
 
 private:
   T& m_ref_to_val;
diff --git a/include/RAJA/util/zip.hpp b/include/RAJA/util/zip.hpp
index 1beefeb9cc..5e17b8f757 100644
--- a/include/RAJA/util/zip.hpp
+++ b/include/RAJA/util/zip.hpp
@@ -19,16 +19,15 @@
 #ifndef RAJA_util_zip_HPP
 #define RAJA_util_zip_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/detail/algorithm.hpp"
+#include "RAJA/util/Span.hpp"
 #include "RAJA/util/camp_aliases.hpp"
 #include "RAJA/util/concepts.hpp"
 #include "RAJA/util/zip_tuple.hpp"
-#include "RAJA/util/Span.hpp"
 
 namespace RAJA
 {
@@ -37,39 +36,39 @@ namespace RAJA
     \brief ZipIterator class for simultaneously iterating over
     multiple iterators. This is not a standards compliant iterator.
 */
-template < typename ... Iters >
-struct ZipIterator
-{
-  static_assert(concepts::all_of<type_traits::is_random_access_iterator<Iters>...>::value,
+template <typename... Iters>
+struct ZipIterator {
+  static_assert(
+      concepts::all_of<type_traits::is_random_access_iterator<Iters>...>::value,
       "ZipIterator can only contain random access iterators");
   static_assert(sizeof...(Iters) > 1,
-      "ZipIterator must contain one or more iterators");
+                "ZipIterator must contain one or more iterators");
 
-  using value_type = zip_val<typename std::iterator_traits<Iters>::value_type...>;
+  using value_type =
+      zip_val<typename std::iterator_traits<Iters>::value_type...>;
   using difference_type = std::ptrdiff_t;
   using pointer = void;
   using reference = zip_ref<typename std::iterator_traits<Iters>::reference...>;
-  using creference = zip_ref<const typename std::iterator_traits<Iters>::reference...>;
+  using creference =
+      zip_ref<const typename std::iterator_traits<Iters>::reference...>;
   using iterator_category = std::random_access_iterator_tag;
 
-  RAJA_HOST_DEVICE inline ZipIterator()
-    : m_iterators()
-  {
-  }
+  RAJA_HOST_DEVICE inline ZipIterator() : m_iterators() {}
 
-  template < typename... Args,
-             typename = concepts::enable_if<type_traits::convertible_to<Args&&, Iters>...> >
+  template <typename... Args,
+            typename = concepts::enable_if<
+                type_traits::convertible_to<Args&&, Iters>...>>
   RAJA_HOST_DEVICE inline ZipIterator(Args&&... args)
-    : m_iterators(std::forward<Args>(args)...)
+      : m_iterators(std::forward<Args>(args)...)
   {
   }
 
   RAJA_HOST_DEVICE inline ZipIterator(const ZipIterator& rhs)
-    : m_iterators(rhs.m_iterators)
+      : m_iterators(rhs.m_iterators)
   {
   }
   RAJA_HOST_DEVICE inline ZipIterator(ZipIterator&& rhs)
-    : m_iterators(std::move(rhs.m_iterators))
+      : m_iterators(std::move(rhs.m_iterators))
   {
   }
 
@@ -97,11 +96,11 @@ struct ZipIterator
   }
   RAJA_HOST_DEVICE inline bool operator>(const ZipIterator& rhs) const
   {
-    return RAJA::get<0>(m_iterators) >  RAJA::get<0>(rhs.m_iterators);
+    return RAJA::get<0>(m_iterators) > RAJA::get<0>(rhs.m_iterators);
   }
   RAJA_HOST_DEVICE inline bool operator<(const ZipIterator& rhs) const
   {
-    return RAJA::get<0>(m_iterators) <  RAJA::get<0>(rhs.m_iterators);
+    return RAJA::get<0>(m_iterators) < RAJA::get<0>(rhs.m_iterators);
   }
   RAJA_HOST_DEVICE inline bool operator>=(const ZipIterator& rhs) const
   {
@@ -135,14 +134,12 @@ struct ZipIterator
     return tmp;
   }
 
-  RAJA_HOST_DEVICE inline ZipIterator& operator+=(
-      const difference_type& rhs)
+  RAJA_HOST_DEVICE inline ZipIterator& operator+=(const difference_type& rhs)
   {
     detail::zip_for_each(m_iterators, detail::PlusEq<difference_type>{rhs});
     return *this;
   }
-  RAJA_HOST_DEVICE inline ZipIterator& operator-=(
-      const difference_type& rhs)
+  RAJA_HOST_DEVICE inline ZipIterator& operator-=(const difference_type& rhs)
   {
     detail::zip_for_each(m_iterators, detail::MinusEq<difference_type>{rhs});
     return *this;
@@ -167,9 +164,8 @@ struct ZipIterator
     tmp -= rhs;
     return tmp;
   }
-  RAJA_HOST_DEVICE friend ZipIterator operator+(
-      difference_type lhs,
-      const ZipIterator& rhs)
+  RAJA_HOST_DEVICE friend ZipIterator operator+(difference_type lhs,
+                                                const ZipIterator& rhs)
   {
     ZipIterator tmp(rhs);
     tmp += lhs;
@@ -190,7 +186,8 @@ struct ZipIterator
     return *((*this) + rhs);
   }
 
-  RAJA_HOST_DEVICE friend inline void safe_iter_swap(ZipIterator lhs, ZipIterator rhs)
+  RAJA_HOST_DEVICE friend inline void safe_iter_swap(ZipIterator lhs,
+                                                     ZipIterator rhs)
   {
     detail::zip_for_each(lhs.m_iterators, rhs.m_iterators, detail::IterSwap{});
   }
@@ -198,7 +195,7 @@ struct ZipIterator
 private:
   zip_val<camp::decay<Iters>...> m_iterators;
 
-  template < camp::idx_t ... Is >
+  template <camp::idx_t... Is>
   RAJA_HOST_DEVICE inline reference deref_helper(camp::idx_seq<Is...>) const
   {
     return reference(*RAJA::get<Is>(m_iterators)...);
@@ -210,10 +207,8 @@ struct ZipIterator
     \brief Zip multiple iterators together to iterate them simultaneously with
     a single ZipIterator object.
 */
-template < typename... Args >
-RAJA_HOST_DEVICE
-auto zip(Args&&... args)
-  -> ZipIterator<camp::decay<Args>...>
+template <typename... Args>
+RAJA_HOST_DEVICE auto zip(Args&&... args) -> ZipIterator<camp::decay<Args>...>
 {
   return {std::forward<Args>(args)...};
 }
@@ -223,29 +218,27 @@ auto zip(Args&&... args)
     ZipIterator objects.
 */
 template <typename... Args>
-RAJA_HOST_DEVICE RAJA_INLINE
-auto zip_span(Args&&... args)
-  -> Span<ZipIterator<detail::ContainerIter<camp::decay<Args>>...>,
-          typename ZipIterator<detail::ContainerIter<camp::decay<Args>>...>::difference_type>
+RAJA_HOST_DEVICE RAJA_INLINE auto zip_span(Args&&... args)
+    -> Span<ZipIterator<detail::ContainerIter<camp::decay<Args>>...>,
+            typename ZipIterator<
+                detail::ContainerIter<camp::decay<Args>>...>::difference_type>
 {
   using std::begin;
   using std::end;
   return Span<ZipIterator<detail::ContainerIter<camp::decay<Args>>...>,
-              typename ZipIterator<detail::ContainerIter<camp::decay<Args>>...>::difference_type>(
+              typename ZipIterator<detail::ContainerIter<
+                  camp::decay<Args>>...>::difference_type>(
       zip(begin(std::forward<Args>(args))...),
-      zip(  end(std::forward<Args>(args))...));
+      zip(end(std::forward<Args>(args))...));
 }
 
 /*!
     \brief Comparator object that compares the first member
     of tuple like objects.
 */
-template < typename T, typename Compare >
-struct CompareFirst
-{
-  RAJA_HOST_DEVICE inline CompareFirst(Compare comp_)
-    : comp(comp_)
-  { }
+template <typename T, typename Compare>
+struct CompareFirst {
+  RAJA_HOST_DEVICE inline CompareFirst(Compare comp_) : comp(comp_) {}
 
   RAJA_HOST_DEVICE inline bool operator()(T const& lhs, T const& rhs)
   {
@@ -260,10 +253,8 @@ struct CompareFirst
     \brief Make a comparator to compare first member of tuple
     like objects of type T.
 */
-template < typename T, typename Compare >
-RAJA_HOST_DEVICE
-auto compare_first(Compare comp)
-  -> CompareFirst<T, Compare>
+template <typename T, typename Compare>
+RAJA_HOST_DEVICE auto compare_first(Compare comp) -> CompareFirst<T, Compare>
 {
   return {comp};
 }
diff --git a/include/RAJA/util/zip_tuple.hpp b/include/RAJA/util/zip_tuple.hpp
index d631d4714b..bfbd31cbfa 100644
--- a/include/RAJA/util/zip_tuple.hpp
+++ b/include/RAJA/util/zip_tuple.hpp
@@ -19,11 +19,10 @@
 #ifndef RAJA_util_zip_ref_HPP
 #define RAJA_util_zip_ref_HPP
 
-#include "RAJA/config.hpp"
-
 #include <iostream>
 #include <type_traits>
 
+#include "RAJA/config.hpp"
 #include "RAJA/pattern/detail/algorithm.hpp"
 #include "RAJA/util/camp_aliases.hpp"
 #include "RAJA/util/concepts.hpp"
@@ -31,121 +30,125 @@
 namespace RAJA
 {
 
-template < bool is_val, typename ... Ts >
+template <bool is_val, typename... Ts>
 struct zip_tuple;
 
-template < camp::idx_t I, typename ZT >
+template <camp::idx_t I, typename ZT>
 struct zip_tuple_element;
 
-template < camp::idx_t I, bool is_val, typename ... Ts >
+template <camp::idx_t I, bool is_val, typename... Ts>
 struct zip_tuple_element<I, zip_tuple<is_val, Ts...>>
-  : camp::tuple_element<I, typename zip_tuple<is_val, Ts...>::value_type>
-{ };
+    : camp::tuple_element<I, typename zip_tuple<is_val, Ts...>::value_type> {
+};
 
-template < camp::idx_t I, typename ZT >
+template <camp::idx_t I, typename ZT>
 using zip_tuple_element_t = typename zip_tuple_element<I, ZT>::type;
 
 
 // get function declarations for zip_tuple
 // the reference type returned by get depends on the reference type
 // of the zip_tuple that get is called on
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr                         RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>> &
-get(zip_tuple<is_val, Ts...>      &  z) noexcept
-{ return           z .template get<I>(); }
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr                         RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>> const&
-get(zip_tuple<is_val, Ts...> const&  z) noexcept
-{ return           z .template get<I>(); }
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>> &&
-get(zip_tuple<is_val, Ts...>      && z) noexcept
-{ return std::move(z).template get<I>(); }
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>> const&&
+template <camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr RAJA::zip_tuple_element_t<I,
+                                                     zip_tuple<is_val, Ts...>>&
+get(zip_tuple<is_val, Ts...>& z) noexcept
+{
+  return z.template get<I>();
+}
+template <camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr RAJA::
+    zip_tuple_element_t<I, zip_tuple<is_val, Ts...>> const&
+    get(zip_tuple<is_val, Ts...> const& z) noexcept
+{
+  return z.template get<I>();
+}
+template <camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+    RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>>&&
+get(zip_tuple<is_val, Ts...>&& z) noexcept
+{
+  return std::move(z).template get<I>();
+}
+template <camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+    RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>> const&&
 get(zip_tuple<is_val, Ts...> const&& z) noexcept
-{ return std::move(z).template get<I>(); }
+{
+  return std::move(z).template get<I>();
+}
 
 namespace detail
 {
 
-struct PassThrough
-{
-  template < typename T >
+struct PassThrough {
+  template <typename T>
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(T&& t) const
-    -> decltype(std::forward<T>(t))
+      -> decltype(std::forward<T>(t))
   {
     return std::forward<T>(t);
   }
 };
 
-struct Move
-{
-  template < typename T >
+struct Move {
+  template <typename T>
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(T&& t) const
-    -> decltype(std::move(t))
+      -> decltype(std::move(t))
   {
     return std::move(t);
   }
 };
 
-struct PreInc
-{
-  template< typename Iter >
+struct PreInc {
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(++std::forward<Iter>(iter))
+      -> decltype(++std::forward<Iter>(iter))
   {
     return ++std::forward<Iter>(iter);
   }
 };
 
-struct PreDec
-{
-  template< typename Iter >
+struct PreDec {
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(--std::forward<Iter>(iter))
+      -> decltype(--std::forward<Iter>(iter))
   {
     return --std::forward<Iter>(iter);
   }
 };
 
-template < typename difference_type >
-struct PlusEq
-{
+template <typename difference_type>
+struct PlusEq {
   const difference_type& rhs;
-  template< typename Iter >
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(std::forward<Iter>(iter) += rhs)
+      -> decltype(std::forward<Iter>(iter) += rhs)
   {
     return std::forward<Iter>(iter) += rhs;
   }
 };
 
-template < typename difference_type >
-struct MinusEq
-{
+template <typename difference_type>
+struct MinusEq {
   const difference_type& rhs;
-  template< typename Iter >
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(std::forward<Iter>(iter) -= rhs)
+      -> decltype(std::forward<Iter>(iter) -= rhs)
   {
     return std::forward<Iter>(iter) -= rhs;
   }
 };
 
-struct DeRef
-{
-  template< typename Iter >
+struct DeRef {
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(*std::forward<Iter>(iter))
+      -> decltype(*std::forward<Iter>(iter))
   {
     return *std::forward<Iter>(iter);
   }
 };
 
-struct Swap
-{
-  template< typename T0, typename T1 >
+struct Swap {
+  template <typename T0, typename T1>
   RAJA_HOST_DEVICE inline int operator()(T0&& t0, T1&& t1) const
   {
     using camp::safe_swap;
@@ -154,9 +157,8 @@ struct Swap
   }
 };
 
-struct IterSwap
-{
-  template< typename T0, typename T1 >
+struct IterSwap {
+  template <typename T0, typename T1>
   RAJA_HOST_DEVICE inline int operator()(T0&& t0, T1&& t1) const
   {
     using RAJA::safe_iter_swap;
@@ -169,9 +171,10 @@ struct IterSwap
 /*!
     \brief Call f on each member of t (f(t)...).
 */
-template < typename Tuple, typename F, camp::idx_t... Is >
-RAJA_HOST_DEVICE inline
-void zip_for_each_impl(Tuple&& t, F&& f, camp::idx_seq<Is...>)
+template <typename Tuple, typename F, camp::idx_t... Is>
+RAJA_HOST_DEVICE inline void zip_for_each_impl(Tuple&& t,
+                                               F&& f,
+                                               camp::idx_seq<Is...>)
 {
   camp::sink(std::forward<F>(f)(RAJA::get<Is>(std::forward<Tuple>(t)))...);
 }
@@ -179,51 +182,58 @@ void zip_for_each_impl(Tuple&& t, F&& f, camp::idx_seq<Is...>)
 /*!
     \brief Call f on each member of t0 and t1 (f(t0, t1)...).
 */
-template < typename Tuple0, typename Tuple1, typename F, camp::idx_t... Is >
-RAJA_HOST_DEVICE inline
-void zip_for_each_impl(Tuple0&& t0, Tuple1&& t1, F&& f, camp::idx_seq<Is...>)
+template <typename Tuple0, typename Tuple1, typename F, camp::idx_t... Is>
+RAJA_HOST_DEVICE inline void zip_for_each_impl(Tuple0&& t0,
+                                               Tuple1&& t1,
+                                               F&& f,
+                                               camp::idx_seq<Is...>)
 {
-  camp::sink(std::forward<F>(f)(RAJA::get<Is>(std::forward<Tuple0>(t0)), RAJA::get<Is>(std::forward<Tuple1>(t1)))...);
+  camp::sink(std::forward<F>(f)(RAJA::get<Is>(std::forward<Tuple0>(t0)),
+                                RAJA::get<Is>(std::forward<Tuple1>(t1)))...);
 }
 
 /*!
     \brief Call f on each member of t (f(t)...).
 */
-template < typename Tuple, typename F >
-RAJA_HOST_DEVICE inline
-void zip_for_each(Tuple&& t, F&& f)
+template <typename Tuple, typename F>
+RAJA_HOST_DEVICE inline void zip_for_each(Tuple&& t, F&& f)
 {
-  zip_for_each_impl(std::forward<Tuple>(t), std::forward<F>(f), typename camp::decay<Tuple>::IdxSeq{});
+  zip_for_each_impl(std::forward<Tuple>(t),
+                    std::forward<F>(f),
+                    typename camp::decay<Tuple>::IdxSeq{});
 }
 
 /*!
     \brief Call f on each member of t0 and t1 (f(t0, t1)...).
 */
-template < typename Tuple0, typename Tuple1, typename F >
-RAJA_HOST_DEVICE inline
-void zip_for_each(Tuple0&& t0, Tuple1&& t1, F&& f)
+template <typename Tuple0, typename Tuple1, typename F>
+RAJA_HOST_DEVICE inline void zip_for_each(Tuple0&& t0, Tuple1&& t1, F&& f)
 {
-  static_assert(std::is_same<typename camp::decay<Tuple0>::IdxSeq, typename camp::decay<Tuple1>::IdxSeq>::value,
-      "Tuple0 and Tuple1 must have the same size");
-  zip_for_each_impl(std::forward<Tuple0>(t0), std::forward<Tuple1>(t1), std::forward<F>(f), typename camp::decay<Tuple0>::IdxSeq{});
+  static_assert(std::is_same<typename camp::decay<Tuple0>::IdxSeq,
+                             typename camp::decay<Tuple1>::IdxSeq>::value,
+                "Tuple0 and Tuple1 must have the same size");
+  zip_for_each_impl(std::forward<Tuple0>(t0),
+                    std::forward<Tuple1>(t1),
+                    std::forward<F>(f),
+                    typename camp::decay<Tuple0>::IdxSeq{});
 }
 
-} // end namespace detail
+}  // end namespace detail
 
 /*!
     \brief Tuple used by ZipIterator for storing multiple references and values.
-    Acts like a reference to its members allowing copy/move construction/assignment
-    based on the reference type of the zip_tuple.
+    Acts like a reference to its members, allowing copy/move
+    construction/assignment based on the reference type of the zip_tuple.
 */
-template < bool is_val, typename ... Ts >
-struct zip_tuple
-{
+template <bool is_val, typename... Ts>
+struct zip_tuple {
   using value_type = RAJA::tuple<Ts...>;
 
-  template < typename T >
-  using opp_type = typename std::conditional< is_val,
-        typename std::add_lvalue_reference<T>::type,
-        typename std::remove_reference<T>::type >::type;
+  template <typename T>
+  using opp_type =
+      typename std::conditional<is_val,
+                                typename std::add_lvalue_reference<T>::type,
+                                typename std::remove_reference<T>::type>::type;
 
   // zip_tuple type with opposite is_val
   using opp_tuple = zip_tuple<!is_val, opp_type<Ts>...>;
@@ -232,74 +242,119 @@ struct zip_tuple
   using IdxSeq = camp::make_idx_seq_t<sizeof...(Ts)>;
 
   // constructor from types convertible to Ts
-  template < typename ... Os
-           , typename = concepts::enable_if<type_traits::convertible_to<Os&&, Ts>...> >
+  template <
+      typename... Os,
+      typename = concepts::enable_if<type_traits::convertible_to<Os&&, Ts>...>>
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(Os&&... os)
-    : m_tuple(std::forward<Os>(os)...) { }
+      : m_tuple(std::forward<Os>(os)...)
+  {
+  }
 
   // assignment from types convertible to Ts
-  template < typename ... Os
-           , typename = concepts::enable_if<type_traits::convertible_to<Os&&, typename std::remove_reference<Ts>::type>...> >
+  template <typename... Os,
+            typename = concepts::enable_if<type_traits::convertible_to<
+                Os&&,
+                typename std::remove_reference<Ts>::type>...>>
   zip_tuple& assign(Os&&... os)
-  { return assign_helper(IdxSeq{}, std::forward<Os>(os)...); }
+  {
+    return assign_helper(IdxSeq{}, std::forward<Os>(os)...);
+  }
 
   // copy and move constructors
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &      o)
-    : zip_tuple(          o , IdxSeq{}) { }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple& o) : zip_tuple(o, IdxSeq{})
+  {
+  }
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o)
-    : zip_tuple(          o , IdxSeq{}) { }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &&     o)
-    : zip_tuple(std::move(o), IdxSeq{}) { } // move if is_val, pass-through otherwise
+      : zip_tuple(o, IdxSeq{})
+  {
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple&& o)
+      : zip_tuple(std::move(o), IdxSeq{})
+  {
+  }  // move if is_val, pass-through otherwise
 
   // copy and move assignment operators
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple &      o)
-  { return assign_helper(          o , IdxSeq{}); }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple& o)
+  {
+    return assign_helper(o, IdxSeq{});
+  }
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple const& o)
-  { return assign_helper(          o , IdxSeq{}); }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple &&     o)
-  { return assign_helper(std::move(o), IdxSeq{}); }
+  {
+    return assign_helper(o, IdxSeq{});
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple&& o)
+  {
+    return assign_helper(std::move(o), IdxSeq{});
+  }
 
   // copy and move constructors from opp_tuple type zip_tuples
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &      o)
-    : zip_tuple(          o , IdxSeq{}) { }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple& o) : zip_tuple(o, IdxSeq{})
+  {
+  }
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o)
-    : zip_tuple(          o , IdxSeq{}) { }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &&     o)
-    : zip_tuple(std::move(o), IdxSeq{}) { } // move if is_val, pass-through otherwise
+      : zip_tuple(o, IdxSeq{})
+  {
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple&& o)
+      : zip_tuple(std::move(o), IdxSeq{})
+  {
+  }  // move if is_val, pass-through otherwise
 
   // copy and move assignment operators from opp_tuple type zip_tuples
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple &      o)
-  { return assign_helper(          o , IdxSeq{}); }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple& o)
+  {
+    return assign_helper(o, IdxSeq{});
+  }
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple const& o)
-  { return assign_helper(          o , IdxSeq{}); }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple &&     o)
-  { return assign_helper(std::move(o), IdxSeq{}); }
+  {
+    return assign_helper(o, IdxSeq{});
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple&& o)
+  {
+    return assign_helper(std::move(o), IdxSeq{});
+  }
 
   // get member functions for zip_tuples
   // the reference type returned by get depends on the reference type
   // of the zip_tuple that get is called on
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr                         RAJA::tuple_element_t<I, value_type> & get() & noexcept
-  { return RAJA::get<I>(m_tuple); }
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr                         RAJA::tuple_element_t<I, value_type> const& get() const& noexcept
-  { return RAJA::get<I>(m_tuple); }
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::tuple_element_t<I, value_type>> && get() && noexcept
-  { return std::move(RAJA::get<I>(m_tuple)); }
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::tuple_element_t<I, value_type>> const&& get() const&& noexcept
-  { return std::move(RAJA::get<I>(m_tuple)); }
+  template <camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr RAJA::tuple_element_t<I, value_type>&
+  get() & noexcept
+  {
+    return RAJA::get<I>(m_tuple);
+  }
+  template <camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr RAJA::tuple_element_t<I, value_type> const& get()
+      const& noexcept
+  {
+    return RAJA::get<I>(m_tuple);
+  }
+  template <camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+      RAJA::tuple_element_t<I, value_type>>&&
+  get() && noexcept
+  {
+    return std::move(RAJA::get<I>(m_tuple));
+  }
+  template <camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+      RAJA::tuple_element_t<I, value_type>> const&&
+  get() const&& noexcept
+  {
+    return std::move(RAJA::get<I>(m_tuple));
+  }
 
   // safe_swap that calls swap on each pair in the tuple
-  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs, zip_tuple& rhs)
+  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs,
+                                                     zip_tuple& rhs)
   {
     detail::zip_for_each(lhs, rhs, detail::Swap{});
   }
 
   // safe_swap for swapping zip_tuples with opposite is_val
   // calls swap on each pair in the tuple
-  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs, opp_tuple& rhs)
+  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs,
+                                                     opp_tuple& rhs)
   {
     detail::zip_for_each(lhs, rhs, detail::Swap{});
   }
@@ -313,67 +368,114 @@ struct zip_tuple
 private:
   // move if is_val is true, otherwise copy in move constructor
   // this allows values to be moved, and references to stay lvalue references
-  using IsValMover = typename std::conditional<is_val, detail::Move, detail::PassThrough>::type;
+  using IsValMover = typename std::
+      conditional<is_val, detail::Move, detail::PassThrough>::type;
 
   value_type m_tuple;
 
   // assignment helper from types convertible to Ts
-  template < typename ... Os, camp::idx_t ... Is >
+  template <typename... Os, camp::idx_t... Is>
   zip_tuple& assign_helper(camp::idx_seq<Is...>, Os&&... os)
-  { camp::sink(get<Is>() = std::forward<Os>(os)...); return *this; }
+  {
+    camp::sink(get<Is>() = std::forward<Os>(os)...);
+    return *this;
+  }
 
   // copy and move constructor helpers
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &      o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &&     o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(IsValMover{}(o))...) { } // move if is_val, pass-through otherwise
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o,
+                                         camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple&& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(IsValMover{}(o))...)
+  {
+  }  // move if is_val, pass-through otherwise
 
   // copy and move assignment operator helpers
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple &      o, camp::idx_seq<Is...>)
-  { if (this != &o) { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); } return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple const& o, camp::idx_seq<Is...>)
-  { if (this != &o) { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); } return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple &&     o, camp::idx_seq<Is...>)
-  { if (this != &o) { camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...); } return *this; }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    if (this != &o) {
+      camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    }
+    return *this;
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple const& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    if (this != &o) {
+      camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    }
+    return *this;
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple&& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    if (this != &o) {
+      camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...);
+    }
+    return *this;
+  }
 
   // copy and move constructor helpers from opp_tuple type zip_tuples
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &      o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &&     o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(IsValMover{}(o))...) { } // move if is_val, pass-through otherwise
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o,
+                                         camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple&& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(IsValMover{}(o))...)
+  {
+  }  // move if is_val, pass-through otherwise
 
   // copy and move assignment operator helpers from opp_tuple type zip_tuples
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple &      o, camp::idx_seq<Is...>)
-  { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple const& o, camp::idx_seq<Is...>)
-  { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple &&     o, camp::idx_seq<Is...>)
-  { camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...); return *this; }
-
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    return *this;
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple const& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    return *this;
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple&& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...);
+    return *this;
+  }
 };
 
 // alias zip_ref to zip_tuple capable of storing references (!is_val)
-template < typename ... Ts >
+template <typename... Ts>
 using zip_ref = zip_tuple<false, Ts...>;
 
 // alias zip_val to zip_tuple suitable for storing values (is_val)
-template < typename ... Ts >
+template <typename... Ts>
 using zip_val = zip_tuple<true, Ts...>;
 
 }  // end namespace RAJA
diff --git a/scripts/lc-builds/toss4_clang-format.sh b/scripts/lc-builds/toss4_clang-format.sh
new file mode 100755
index 0000000000..be561ddd4d
--- /dev/null
+++ b/scripts/lc-builds/toss4_clang-format.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+# Generates a clang build configuration with CLANGFORMAT_EXECUTABLE set to the
+# clang-format found in the same clang installation as the chosen compiler.
+#
+# Usage: toss4_clang-format.sh <clang-version> [extra cmake arguments...]
+#
+# The build tree is created in build_lc_toss4-clang-<clang-version>; any
+# existing directory of that name is removed first.
+
+if [ -z "$1" ]; then
+  echo
+  echo "You must pass a clang compiler version number to script with "
+  echo "MAJOR VERSION NUMBER is 14 to enable the 'make style' target"
+  # A missing argument is an error: exit non-zero so callers can detect it.
+  exit 1
+fi
+
+COMP_VER=$1
+shift 1
+
+BUILD_SUFFIX=lc_toss4-clang-${COMP_VER}
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf "build_${BUILD_SUFFIX}" 2>/dev/null
+# Stop if the build directory cannot be created or entered; otherwise cmake
+# would be run from the current directory against "..".
+mkdir "build_${BUILD_SUFFIX}" && cd "build_${BUILD_SUFFIX}" || exit 1
+
+module load cmake/3.23.1
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \
+  -DBLT_CXX_STD=c++14 \
+  -C ../host-configs/lc-builds/toss4/clang_X.cmake \
+  -DENABLE_OPENMP=On \
+  -DENABLE_BENCHMARKS=On \
+  -DCLANGFORMAT_EXECUTABLE=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang-format \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
diff --git a/src/AlignedRangeIndexSetBuilders.cpp b/src/AlignedRangeIndexSetBuilders.cpp
index d95859d71d..d868cfe867 100644
--- a/src/AlignedRangeIndexSetBuilders.cpp
+++ b/src/AlignedRangeIndexSetBuilders.cpp
@@ -18,15 +18,12 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-
 #include <iostream>
 
-#include "RAJA/index/IndexSetBuilders.hpp"
-
 #include "RAJA/index/IndexSet.hpp"
+#include "RAJA/index/IndexSetBuilders.hpp"
 #include "RAJA/index/ListSegment.hpp"
 #include "RAJA/index/RangeSegment.hpp"
-
 #include "camp/resource.hpp"
 
 namespace RAJA
@@ -149,8 +146,8 @@ void buildIndexSetAligned(
         if (lookAhead == scanVal + 1) {
           if ((inrange == 0) && ((scanVal % range_align) == 0)) {
             if (sliceCount != 0) {
-              iset.push_back(ListSegment(&indices_in[dobegin], sliceCount,
-                                          work_res));
+              iset.push_back(
+                  ListSegment(&indices_in[dobegin], sliceCount, work_res));
             }
             inrange = 1;
             dobegin = scanVal;
@@ -185,8 +182,8 @@ void buildIndexSetAligned(
           iset.push_back(RangeSegment(dobegin, dobegin + sliceCount));
         } else {
           ++sliceCount;
-          iset.push_back(ListSegment(&indices_in[dobegin], sliceCount,
-                                      work_res));
+          iset.push_back(
+              ListSegment(&indices_in[dobegin], sliceCount, work_res));
         }
       } else if (scanVal != -1) {
         iset.push_back(ListSegment(&scanVal, 1, work_res));
diff --git a/src/DepGraphNode.cpp b/src/DepGraphNode.cpp
index 176d9e855d..1601a6fca2 100644
--- a/src/DepGraphNode.cpp
+++ b/src/DepGraphNode.cpp
@@ -15,11 +15,11 @@
 // SPDX-License-Identifier: (BSD-3-Clause)
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#include "RAJA/internal/DepGraphNode.hpp"
+
 #include <iostream>
 #include <string>
 
-#include "RAJA/internal/DepGraphNode.hpp"
-
 namespace RAJA
 {
 
diff --git a/src/KokkosPluginLoader.cpp b/src/KokkosPluginLoader.cpp
index fa05e0faf8..80bc101ebb 100644
--- a/src/KokkosPluginLoader.cpp
+++ b/src/KokkosPluginLoader.cpp
@@ -8,75 +8,72 @@
 #include "RAJA/util/KokkosPluginLoader.hpp"
 
 #ifndef _WIN32
-#include <dlfcn.h>
 #include <dirent.h>
+#include <dlfcn.h>
 #endif
 
 const uint64_t kokkos_interface_version = 20171029;
 
 RAJA_INLINE
-bool
-isSharedObject(const std::string& filename)
+bool isSharedObject(const std::string &filename)
 {
-  return (filename.size() > 3 && !filename.compare(filename.size() - 3, 3, ".so"));
+  return (filename.size() > 3 &&
+          !filename.compare(filename.size() - 3, 3, ".so"));
 }
 
-template<typename function>
-RAJA_INLINE
-void
-getFunction(void* plugin, std::vector<function>& functions, const char* fname)
+template <typename function>
+RAJA_INLINE void getFunction(void *plugin,
+                             std::vector<function> &functions,
+                             const char *fname)
 {
-  #ifndef _WIN32
-  function func = (function) dlsym(plugin, fname);
+#ifndef _WIN32
+  function func = (function)dlsym(plugin, fname);
   if (func)
     functions.push_back(func);
   else
     printf("[KokkosPluginLoader]: dlsym failed: %s\n", dlerror());
-  #else
+#else
   RAJA_UNUSED_ARG(plugin);
   RAJA_UNUSED_ARG(functions);
   RAJA_UNUSED_ARG(fname);
-  #endif
+#endif
 }
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 KokkosPluginLoader::KokkosPluginLoader()
 {
   char *env = getenv("KOKKOS_PLUGINS");
-  if (env == nullptr)
-  {
+  if (env == nullptr) {
     return;
   }
   initDirectory(std::string(env));
 
-  for (auto &func : init_functions)
-  {
+  for (auto &func : init_functions) {
     func(0, kokkos_interface_version, 0, nullptr);
   }
 }
 
-void KokkosPluginLoader::preLaunch(const RAJA::util::PluginContext& p)
+void KokkosPluginLoader::preLaunch(const RAJA::util::PluginContext &p)
 {
-  for (auto &func : pre_functions)
-  {
+  for (auto &func : pre_functions) {
     func("", 0, &(p.kID));
   }
 }
 
-void KokkosPluginLoader::postLaunch(const RAJA::util::PluginContext& p)
+void KokkosPluginLoader::postLaunch(const RAJA::util::PluginContext &p)
 {
-  for (auto &func : post_functions)
-  {
+  for (auto &func : post_functions) {
     func(p.kID);
   }
 }
 
 void KokkosPluginLoader::finalize()
 {
-  for (auto &func : finalize_functions)
-  {
+  for (auto &func : finalize_functions) {
     func();
   }
   init_functions.clear();
@@ -88,62 +85,63 @@ void KokkosPluginLoader::finalize()
 // Initialize plugin from a shared object file specified by 'path'.
 void KokkosPluginLoader::initPlugin(const std::string &path)
 {
-  #ifndef _WIN32
+#ifndef _WIN32
   void *plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
-  if (!plugin)
-  {
+  if (!plugin) {
     printf("[KokkosPluginLoader]: dlopen failed: %s\n", dlerror());
   }
 
   // Getting and storing supported kokkos functions.
   getFunction<init_function>(plugin, init_functions, "kokkosp_init_library");
 
-  getFunction<pre_function>(plugin, pre_functions, "kokkosp_begin_parallel_for");
+  getFunction<pre_function>(plugin,
+                            pre_functions,
+                            "kokkosp_begin_parallel_for");
 
-  getFunction<post_function>(plugin, post_functions, "kokkosp_end_parallel_for");
+  getFunction<post_function>(plugin,
+                             post_functions,
+                             "kokkosp_end_parallel_for");
 
-  getFunction<finalize_function>(plugin, finalize_functions, "kokkosp_finalize_library");
-  #else
+  getFunction<finalize_function>(plugin,
+                                 finalize_functions,
+                                 "kokkosp_finalize_library");
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 // Initialize all plugins in a directory specified by 'path'.
 void KokkosPluginLoader::initDirectory(const std::string &path)
 {
-  #ifndef _WIN32
-  if (isSharedObject(path))
-  {
+#ifndef _WIN32
+  if (isSharedObject(path)) {
     initPlugin(path);
     return;
   }
-  
+
   DIR *dir;
   struct dirent *file;
 
-  if ((dir = opendir(path.c_str())) != NULL)
-  {
-    while ((file = readdir(dir)) != NULL)
-    {
-      if (isSharedObject(std::string(file->d_name)))
-      {
+  if ((dir = opendir(path.c_str())) != NULL) {
+    while ((file = readdir(dir)) != NULL) {
+      if (isSharedObject(std::string(file->d_name))) {
         initPlugin(path + "/" + file->d_name);
       }
     }
     closedir(dir);
-  }
-  else
-  {
+  } else {
     perror("[KokkosPluginLoader]: Could not open plugin directory");
   }
-  #else
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 void linkKokkosPluginLoader() {}
 
-} // end namespace util
-} // end namespace RAJA
+}  // end namespace util
+}  // end namespace RAJA
 
-static RAJA::util::PluginRegistry::add<RAJA::util::KokkosPluginLoader> P("KokkosPluginLoader", "Dynamically load plugins ported from the Kokkos library.");
+static RAJA::util::PluginRegistry::add<RAJA::util::KokkosPluginLoader> P(
+    "KokkosPluginLoader",
+    "Dynamically load plugins ported from the Kokkos library.");
diff --git a/src/LockFreeIndexSetBuilders.cpp b/src/LockFreeIndexSetBuilders.cpp
index f9ef1f51c8..975bc4068f 100644
--- a/src/LockFreeIndexSetBuilders.cpp
+++ b/src/LockFreeIndexSetBuilders.cpp
@@ -18,17 +18,13 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-
 #include <iostream>
 
-#include "RAJA/index/IndexSetBuilders.hpp"
-
 #include "RAJA/index/IndexSet.hpp"
+#include "RAJA/index/IndexSetBuilders.hpp"
 #include "RAJA/index/ListSegment.hpp"
 #include "RAJA/index/RangeSegment.hpp"
-
 #include "RAJA/internal/ThreadUtils_CPU.hpp"
-
 #include "camp/resource.hpp"
 
 namespace RAJA
@@ -38,15 +34,14 @@ namespace RAJA
  ******************************************************************************
  *
  * Generate a lock-free "block" index set (planar division) containing
- * range segments. 
+ * range segments.
  *
  ******************************************************************************
  */
-void buildLockFreeBlockIndexset(
-    RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
-    int fastDim,
-    int midDim,
-    int slowDim)
+void buildLockFreeBlockIndexset(RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
+                                int fastDim,
+                                int midDim,
+                                int slowDim)
 {
   constexpr int PROFITABLE_ENTITY_THRESHOLD_BLOCK = 100;
 
@@ -315,8 +310,8 @@ void buildLockFreeColorIndexset(
         iset.push_back(
             RAJA::RangeSegment(workset[begin], workset[end - 1] + 1));
       } else {
-        iset.push_back(RAJA::ListSegment(&workset[begin], end - begin,
-                                         work_res));
+        iset.push_back(
+            RAJA::ListSegment(&workset[begin], end - begin, work_res));
         // printf("segment %d\n", i) ;
         // for (int j=begin; j<end; ++j) {
         //    printf("%d\n", workset[j]) ;
diff --git a/src/MemUtils_CUDA.cpp b/src/MemUtils_CUDA.cpp
index 85ead614d9..48efb8cefd 100644
--- a/src/MemUtils_CUDA.cpp
+++ b/src/MemUtils_CUDA.cpp
@@ -21,7 +21,6 @@
 #if defined(RAJA_ENABLE_CUDA)
 
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
-
 #include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
 
 
diff --git a/src/MemUtils_HIP.cpp b/src/MemUtils_HIP.cpp
index 97bd82775e..e623ad6eb5 100644
--- a/src/MemUtils_HIP.cpp
+++ b/src/MemUtils_HIP.cpp
@@ -21,7 +21,6 @@
 #if defined(RAJA_ENABLE_HIP)
 
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
-
 #include "RAJA/policy/hip/raja_hiperrchk.hpp"
 
 
diff --git a/src/PluginStrategy.cpp b/src/PluginStrategy.cpp
index e39c5718a8..eee0962fc4 100644
--- a/src/PluginStrategy.cpp
+++ b/src/PluginStrategy.cpp
@@ -9,22 +9,24 @@
 
 RAJA_INSTANTIATE_REGISTRY(PluginRegistry);
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 PluginStrategy::PluginStrategy() = default;
 
-void PluginStrategy::init(const PluginOptions&) { }
+void PluginStrategy::init(const PluginOptions&) {}
 
-void PluginStrategy::preCapture(const PluginContext&) { }
+void PluginStrategy::preCapture(const PluginContext&) {}
 
-void PluginStrategy::postCapture(const PluginContext&) { }
+void PluginStrategy::postCapture(const PluginContext&) {}
 
-void PluginStrategy::preLaunch(const PluginContext&) { }
+void PluginStrategy::preLaunch(const PluginContext&) {}
 
-void PluginStrategy::postLaunch(const PluginContext&) { }
+void PluginStrategy::postLaunch(const PluginContext&) {}
 
-void PluginStrategy::finalize() { }
+void PluginStrategy::finalize() {}
 
-}
-}
+}  // namespace util
+}  // namespace RAJA
diff --git a/src/RuntimePluginLoader.cpp b/src/RuntimePluginLoader.cpp
index 3da10cda8c..513985bcc7 100644
--- a/src/RuntimePluginLoader.cpp
+++ b/src/RuntimePluginLoader.cpp
@@ -8,75 +8,70 @@
 #include "RAJA/util/RuntimePluginLoader.hpp"
 
 #ifndef _WIN32
-#include <dlfcn.h>
 #include <dirent.h>
+#include <dlfcn.h>
 #endif
 
 RAJA_INLINE
-bool
-isSharedObject(const std::string& filename)
+bool isSharedObject(const std::string &filename)
 {
-  return (filename.size() > 3 && !filename.compare(filename.size() - 3, 3, ".so"));
+  return (filename.size() > 3 &&
+          !filename.compare(filename.size() - 3, 3, ".so"));
 }
 
-namespace RAJA {
-namespace util {
-  
+namespace RAJA
+{
+namespace util
+{
+
 RuntimePluginLoader::RuntimePluginLoader()
 {
   char *env = ::getenv("RAJA_PLUGINS");
-  if (nullptr == env)
-  {
+  if (nullptr == env) {
     return;
   }
   initDirectory(std::string(env));
 }
 
-void RuntimePluginLoader::init(const RAJA::util::PluginOptions& p)
+void RuntimePluginLoader::init(const RAJA::util::PluginOptions &p)
 {
   initDirectory(p.str);
-  for (auto &plugin : plugins)
-  {
+  for (auto &plugin : plugins) {
     plugin->init(p);
   }
 }
 
-void RuntimePluginLoader::preCapture(const RAJA::util::PluginContext& p)
+void RuntimePluginLoader::preCapture(const RAJA::util::PluginContext &p)
 {
-  for (auto &plugin : plugins)
-  {
+  for (auto &plugin : plugins) {
     plugin->preCapture(p);
   }
 }
 
-void RuntimePluginLoader::postCapture(const RAJA::util::PluginContext& p)
+void RuntimePluginLoader::postCapture(const RAJA::util::PluginContext &p)
 {
-  for (auto &plugin : plugins)
-  {
+  for (auto &plugin : plugins) {
     plugin->postCapture(p);
   }
 }
 
-void RuntimePluginLoader::preLaunch(const RAJA::util::PluginContext& p)
+void RuntimePluginLoader::preLaunch(const RAJA::util::PluginContext &p)
 {
-  for (auto &plugin : plugins)
-  {
+  for (auto &plugin : plugins) {
     plugin->preLaunch(p);
   }
 }
 
-void RuntimePluginLoader::postLaunch(const RAJA::util::PluginContext& p)
+void RuntimePluginLoader::postLaunch(const RAJA::util::PluginContext &p)
 {
-  for (auto &plugin : plugins)
-  {
+  for (auto &plugin : plugins) {
     plugin->postLaunch(p);
   }
 }
 
 void RuntimePluginLoader::finalize()
 {
-  for (auto &plugin : plugins)
-  {
+  for (auto &plugin : plugins) {
     plugin->finalize();
   }
   plugins.clear();
@@ -85,64 +80,58 @@ void RuntimePluginLoader::finalize()
 // Initialize plugin from a shared object file specified by 'path'.
 void RuntimePluginLoader::initPlugin(const std::string &path)
 {
-  #ifndef _WIN32
+#ifndef _WIN32
   void *plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
-  if (!plugin)
-  {
+  if (!plugin) {
     printf("[RuntimePluginLoader]: dlopen failed: %s\n", dlerror());
   }
 
-  RuntimePluginLoader::Parent *(*getPlugin)() = (RuntimePluginLoader::Parent * (*)()) dlsym(plugin, "getPlugin");
+  RuntimePluginLoader::Parent *(*getPlugin)() =
+      (RuntimePluginLoader::Parent * (*)()) dlsym(plugin, "getPlugin");
 
-  if (getPlugin)
-  {
-    plugins.push_back(std::unique_ptr<RuntimePluginLoader::Parent>(getPlugin()));
-  }
-  else
-  {
+  if (getPlugin) {
+    plugins.push_back(
+        std::unique_ptr<RuntimePluginLoader::Parent>(getPlugin()));
+  } else {
     printf("[RuntimePluginLoader]: dlsym failed: %s\n", dlerror());
   }
-  #else
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 // Initialize all plugins in a directory specified by 'path'.
 void RuntimePluginLoader::initDirectory(const std::string &path)
 {
-  #ifndef _WIN32
-  if (isSharedObject(path))
-  {
+#ifndef _WIN32
+  if (isSharedObject(path)) {
     initPlugin(path);
     return;
   }
-  
+
   DIR *dir;
   struct dirent *file;
 
-  if ((dir = opendir(path.c_str())) != NULL)
-  {
-    while ((file = readdir(dir)) != NULL)
-    {
-      if (isSharedObject(std::string(file->d_name)))
-      {
+  if ((dir = opendir(path.c_str())) != NULL) {
+    while ((file = readdir(dir)) != NULL) {
+      if (isSharedObject(std::string(file->d_name))) {
         initPlugin(path + "/" + file->d_name);
       }
     }
     closedir(dir);
-  }
-  else
-  {
+  } else {
     perror("[RuntimePluginLoader]: Could not open plugin directory");
   }
-  #else
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 void linkRuntimePluginLoader() {}
 
-} // end namespace util
-} // end namespace RAJA
+}  // end namespace util
+}  // end namespace RAJA
 
-static RAJA::util::PluginRegistry::add<RAJA::util::RuntimePluginLoader> P("RuntimePluginLoader", "Dynamically load RAJA plugins.");
+static RAJA::util::PluginRegistry::add<RAJA::util::RuntimePluginLoader> P(
+    "RuntimePluginLoader",
+    "Dynamically load RAJA plugins.");
diff --git a/src/TensorStats.cpp b/src/TensorStats.cpp
index b650b691f9..7eb1c46bfd 100644
--- a/src/TensorStats.cpp
+++ b/src/TensorStats.cpp
@@ -5,9 +5,10 @@
 // SPDX-License-Identifier: (BSD-3-Clause)
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-#include "RAJA/pattern/tensor/stats.hpp"
 #include <stdio.h>
 
+#include "RAJA/pattern/tensor/stats.hpp"
+
 int RAJA::tensor_stats::indent = 0;
 
 camp::idx_t RAJA::tensor_stats::num_vector_copy = 0;
@@ -49,7 +50,8 @@ camp::idx_t RAJA::tensor_stats::num_matrix_mm_multacc_row_row = 0;
 camp::idx_t RAJA::tensor_stats::num_matrix_mm_mult_col_col = 0;
 camp::idx_t RAJA::tensor_stats::num_matrix_mm_multacc_col_col = 0;
 
-void RAJA::tensor_stats::resetVectorStats(){
+void RAJA::tensor_stats::resetVectorStats()
+{
   num_vector_copy = 0;
   num_vector_copy_ctor = 0;
   num_vector_broadcast_ctor = 0;
@@ -88,9 +90,13 @@ void RAJA::tensor_stats::resetVectorStats(){
   num_matrix_mm_multacc_col_col = 0;
 }
 
-#define PRINT_STAT(STAT) if(STAT){printf("  %-32s   %ld\n", #STAT, STAT);}
+#define PRINT_STAT(STAT)                    \
+  if (STAT) {                               \
+    printf("  %-32s   %ld\n", #STAT, STAT); \
+  }
 
-void RAJA::tensor_stats::printVectorStats(){
+void RAJA::tensor_stats::printVectorStats()
+{
 
   printf("RAJA SIMD Register Statistics:\n");
 
@@ -129,5 +135,4 @@ void RAJA::tensor_stats::printVectorStats(){
   PRINT_STAT(num_matrix_mm_multacc_row_row);
   PRINT_STAT(num_matrix_mm_mult_col_col);
   PRINT_STAT(num_matrix_mm_multacc_col_col);
-
 }